Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-01-29 01:42:01 -05:00)
chore(style): reduce line length and truncate compression
Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
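Most hunks below apply the same mechanical rule: a statement that overflows the line-length limit is wrapped so that each argument, dict entry, or signature parameter sits on its own line, with no change in behaviour (a few hunks also consolidate imports, e.g. `pathlib` and `abc`). A minimal sketch of the pattern, using a hypothetical helper rather than code taken from this diff:

```python
def make_runner(**kwargs):  # hypothetical stand-in for any long call reformatted in this commit
    return kwargs

# Before: a single call that overflows the line-length limit.
runner = make_runner(model="opt", max_batch_size=32, max_latency_ms=300, scheduling_strategy="cascading")

# After: the same call wrapped one keyword argument per line; the result is identical.
runner = make_runner(
    model="opt",
    max_batch_size=32,
    max_latency_ms=300,
    scheduling_strategy="cascading",
)
assert runner["max_batch_size"] == 32
```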
@@ -102,7 +102,9 @@ else:
try:
if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_pt_objects"] = [name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")]
_import_structure["utils.dummy_pt_objects"] = [
name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")
]
else:
_import_structure["models.flan_t5"].extend(["FlanT5"])
_import_structure["models.dolly_v2"].extend(["DollyV2"])
@@ -12,9 +12,24 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
try:
return bentoml.transformers.get(ids)
except bentoml.exceptions.NotFound:
model_signatures = {k: ModelSignature(batchable=False) for k in ("forward", "generate", "contrastive_search", "greedy_search", "sample", "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__")}
with bentoml.models.create(ids, module=MODULE_NAME, api_version=API_VERSION, options=ModelOptions(), context=openllm.utils.generate_context(framework_name="transformers"), labels={"runtime": "pt", "framework": "openllm"}, signatures=model_signatures) as bentomodel:
snapshot_download(_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors", "*.h5", "*.ot", "*.pdf", "*.md", ".gitattributes", "LICENSE.txt"])
model_signatures = {
k: ModelSignature(batchable=False)
for k in ("forward", "generate", "contrastive_search", "greedy_search", "sample", "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__")
}
with bentoml.models.create(
ids,
module=MODULE_NAME,
api_version=API_VERSION,
options=ModelOptions(),
context=openllm.utils.generate_context(framework_name="transformers"),
labels={
"runtime": "pt", "framework": "openllm"
},
signatures=model_signatures
) as bentomodel:
snapshot_download(
_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors", "*.h5", "*.ot", "*.pdf", "*.md", ".gitattributes", "LICENSE.txt"]
)
return bentomodel
class GenericEmbeddingRunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
@@ -1,16 +1,12 @@
from __future__ import annotations
import functools, inspect, logging, os, re, traceback, types, typing as t, uuid, attr, fs.path, inflection, orjson, bentoml, openllm, openllm_core, gc
from abc import ABC, abstractmethod
from pathlib import Path
import functools, inspect, logging, os, re, traceback, types, typing as t, uuid, attr, fs.path, inflection, orjson, bentoml, openllm, openllm_core, gc, pathlib, abc
from huggingface_hub import hf_hub_download
from bentoml._internal.models.model import ModelSignature

from openllm_core._configuration import FineTuneConfig, LLMConfig, _object_getattribute, _setattr_class
from ._quantisation import infer_quantisation_config
from openllm_core._schema import unmarshal_vllm_outputs
from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException
from .models.auto import AutoConfig
from openllm_core.utils import DEBUG, ENV_VARS_TRUE_VALUES, MYPY, EnvVarMixin, LazyLoader, ReprMixin, apply, bentoml_cattr, codegen, device_count, first_not_none, generate_hash_from_file, is_peft_available, is_torch_available, non_intrusive_setattr, normalize_attrs_to_model_tokenizer_pair, resolve_filepath, validate_is_path
from ._quantisation import infer_quantisation_config
from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException
from .utils import infer_auto_class
from openllm_core._typing_compat import AdaptersMapping, AdaptersTuple, AnyCallable, AdapterType, LiteralRuntime, DictStrAny, ListStr, LLMEmbeddings, LLMRunnable, LLMRunner, ModelSignatureDict as _ModelSignatureDict, PeftAdapterOutput, TupleAny, NotRequired, overload, M, T, LiteralString
@@ -68,7 +64,7 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp
resolved[_peft_type] += (_AdaptersTuple((path_or_adapter_id, resolve_name, resolved_config)),)
return resolved
_reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"}
class LLMInterface(ABC, t.Generic[M, T]):
class LLMInterface(abc.ABC, t.Generic[M, T]):
"""This defines the loose contract for all openllm.LLM implementations."""
@property
def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None:

@@ -91,7 +87,7 @@ class LLMInterface(ABC, t.Generic[M, T]):
"""
raise NotImplementedError

@abstractmethod
@abc.abstractmethod
def generate(self, prompt: str, **preprocess_generate_kwds: t.Any) -> t.Any:
"""The implementation for text generation from given prompt.

@@ -141,7 +137,7 @@ class LLMInterface(ABC, t.Generic[M, T]):
"""
raise NotImplementedError

def save_pretrained(self, save_directory: str | Path, **attrs: t.Any) -> None:
def save_pretrained(self, save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
"""This function defines how this model can be saved to local store.

This will be called during ``import_model``. By default, it will use ``openllm.serialisation.save_pretrained``.
@@ -234,7 +230,7 @@ class _llm_post_init_wrapper(t.Generic[M, T], t.Protocol):
def __call__(self, llm: LLM[M, T]) -> T:
...
class _save_pretrained_wrapper(t.Generic[M, T], t.Protocol):
def __call__(self, llm: LLM[M, T], save_directory: str | Path, **attrs: t.Any) -> None:
def __call__(self, llm: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
...
_object_setattr = object.__setattr__
# NOTE: the following wrapper are a light meta ops for wrapping default params to internal methods implementation.

@@ -250,7 +246,9 @@ def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Ca
return wrapper
_DEFAULT_TOKENIZER = "hf-internal-testing/llama-tokenizer"
def get_engine_args(llm: LLM[M, T], tokenizer: str = _DEFAULT_TOKENIZER) -> vllm.EngineArgs:
return vllm.EngineArgs(model=llm._bentomodel.path, tokenizer=tokenizer, tokenizer_mode="auto", tensor_parallel_size=1 if device_count() < 2 else device_count(), dtype="auto", worker_use_ray=False)
return vllm.EngineArgs(
model=llm._bentomodel.path, tokenizer=tokenizer, tokenizer_mode="auto", tensor_parallel_size=1 if device_count() < 2 else device_count(), dtype="auto", worker_use_ray=False
)
def _wrapped_load_model(f: _load_model_wrapper[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
@functools.wraps(f)
def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
@@ -279,12 +277,13 @@ def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M
f(self)

return wrapper
def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[LLM[M, T], str | Path], None]:
def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[LLM[M, T], str | pathlib.Path], None]:
@functools.wraps(f)
def wrapper(self: LLM[M, T], save_directory: str | Path, **attrs: t.Any) -> None:
if isinstance(save_directory, Path): save_directory = str(save_directory)
def wrapper(self: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
if isinstance(save_directory, pathlib.Path): save_directory = str(save_directory)
if self.__llm_model__ is None: raise RuntimeError("Cannot 'save_pretrained' with unload model instance.")
if self.bettertransformer and self.__llm_implementation__ == "pt": _object_setattr(self, "__llm_model__", t.cast("transformers.PreTrainedModel", self.__llm_model__).reverse_bettertransformer())
if self.bettertransformer and self.__llm_implementation__ == "pt":
_object_setattr(self, "__llm_model__", t.cast("transformers.PreTrainedModel", self.__llm_model__).reverse_bettertransformer())
f(self, save_directory, **attrs)

return wrapper

@@ -300,7 +299,13 @@ def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable:
setattr(cls, fn, original_fn)
return original_fn
def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
attributes = {"import_model": _wrapped_import_model, "load_model": _wrapped_load_model, "load_tokenizer": _wrapped_load_tokenizer, "llm_post_init": _wrapped_llm_post_init, "save_pretrained": _wrapped_save_pretrained}
attributes = {
"import_model": _wrapped_import_model,
"load_model": _wrapped_load_model,
"load_tokenizer": _wrapped_load_tokenizer,
"llm_post_init": _wrapped_llm_post_init,
"save_pretrained": _wrapped_save_pretrained
}
args: ListStr = []
anns: DictStrAny = {}
lines: ListStr = []
@@ -372,7 +377,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
cd = cls.__dict__
implementation, config_class_name = cls._infer_implementation_from_name(cls.__name__)
cls.__llm_implementation__ = implementation
config_class = AutoConfig.infer_class_from_name(config_class_name)
config_class = openllm.AutoConfig.infer_class_from_name(config_class_name)
if "__openllm_internal__" in cd:
if "config_class" not in cd: cls.config_class = config_class
elif "config_class" not in cd: raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")

@@ -532,11 +537,14 @@ class LLM(LLMInterface[M, T], ReprMixin):
return f"{cls.__llm_implementation__}-{model_name}:{maybe_revision[0]}"

tag_name = f"{cls.__llm_implementation__}-{model_name}"
if os.environ.get("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES: return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
if os.environ.get("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES:
return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
if validate_is_path(model_id): model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
else:
from .serialisation.transformers._helpers import process_config
model_version = getattr(process_config(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default="main"))[0], "_commit_hash", None)
model_version = getattr(
process_config(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default="main"))[0], "_commit_hash", None
)
if model_version is None: raise ValueError(f"Internal errors when parsing config for pretrained '{model_id}' ('commit_hash' not found)")
return f"{tag_name}:{model_version}"
@@ -544,7 +552,22 @@ class LLM(LLMInterface[M, T], ReprMixin):
def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag:
return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))

def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig, bettertransformer: bool | None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, _quantize_method: t.Literal["int8", "int4", "gptq"] | None, _runtime: t.Literal["ggml", "transformers"], _model_version: str, _serialisation_format: t.Literal["safetensors", "legacy"], _local: bool, **attrs: t.Any,):
def __init__(
self,
*args: t.Any,
model_id: str,
llm_config: LLMConfig,
bettertransformer: bool | None,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
_adapters_mapping: AdaptersMapping | None,
_tag: bentoml.Tag,
_quantize_method: t.Literal["int8", "int4", "gptq"] | None,
_runtime: t.Literal["ggml", "transformers"],
_model_version: str,
_serialisation_format: t.Literal["safetensors", "legacy"],
_local: bool,
**attrs: t.Any,
):
"""Initialize the LLM with given pretrained model.

> [!WARNING]
@@ -641,10 +664,28 @@ class LLM(LLMInterface[M, T], ReprMixin):
# parsing tokenizer and model kwargs, as the hierachy is param pass > default
normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
# NOTE: Save the args and kwargs for latter load
self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, {**model_kwds, **normalized_model_kwds}, {**tokenizer_kwds, **normalized_tokenizer_kwds}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local)
self.__attrs_init__(
llm_config,
quantization_config,
model_id,
_runtime,
args, {
**model_kwds, **normalized_model_kwds
}, {
**tokenizer_kwds, **normalized_tokenizer_kwds
},
_tag,
_adapters_mapping,
_model_version,
_quantize_method,
_serialisation_format,
_local
)
# handle trust_remote_code
_from_env = os.getenv("TRUST_REMOTE_CODE", None)
self.__llm_trust_remote_code__ = first_not_none(str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"]))
self.__llm_trust_remote_code__ = first_not_none(
str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"])
)

self.llm_post_init()
# we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
@@ -654,7 +695,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
if _adapters_mapping and self.bettertransformer is True: self.bettertransformer = False

def __setattr__(self, attr: str, value: t.Any) -> None:
if attr in _reserved_namespace: raise ForbiddenAttributeError(f"{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.")
if attr in _reserved_namespace:
raise ForbiddenAttributeError(
f"{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}."
)
super().__setattr__(attr, value)

@property

@@ -704,7 +748,15 @@ class LLM(LLMInterface[M, T], ReprMixin):
return self._tag

def ensure_model_id_exists(self) -> bentoml.Model:
return openllm.import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format)
return openllm.import_model(
self.config["start_name"],
model_id=self.model_id,
model_version=self._model_version,
runtime=self.runtime,
implementation=self.__llm_implementation__,
quantize=self._quantize_method,
serialisation_format=self._serialisation_format
)

@property
def _bentomodel(self) -> bentoml.Model:
@@ -747,7 +799,9 @@ class LLM(LLMInterface[M, T], ReprMixin):
try:
model = model.to("cuda")
except Exception as err:
raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
raise OpenLLMException(
f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information."
) from err
self.__llm_model__ = model
return self.__llm_model__

@@ -758,7 +812,9 @@ class LLM(LLMInterface[M, T], ReprMixin):
return self.__llm_tokenizer__

def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig:
strategy = first_not_none(self.config["fine_tune_strategies"].get(_adapter_type), default=FineTuneConfig(adapter_type=t.cast("PeftType", _adapter_type), llm_config_class=self.config_class))
strategy = first_not_none(
self.config["fine_tune_strategies"].get(_adapter_type), default=FineTuneConfig(adapter_type=t.cast("PeftType", _adapter_type), llm_config_class=self.config_class)
)
return strategy.eval() if inference_mode else strategy.train()

def _transpose_adapter_mapping(self, inference_mode: bool = True, use_cache: bool = True) -> ResolvedAdaptersMapping:
@@ -773,19 +829,24 @@ class LLM(LLMInterface[M, T], ReprMixin):
for _adapter_type, _adapters_tuples in self._adapters_mapping.items():
default_config = self._default_ft_config(_adapter_type, inference_mode)
for adapter in _adapters_tuples:
if not adapter.name and _converted_first_none: raise ValueError(f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}")
if not adapter.name and _converted_first_none:
raise ValueError(f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}")
name = adapter.name
if name is None:
_converted_first_none = True
name = "default"
peft_config = default_config.with_config(**adapter.config).to_peft_config() if name == "default" else FineTuneConfig(adapter_type=t.cast("PeftType", _adapter_type), adapter_config=adapter.config, inference_mode=inference_mode, llm_config_class=self.config_class).to_peft_config()
peft_config = default_config.with_config(**adapter.config).to_peft_config() if name == "default" else FineTuneConfig(
adapter_type=t.cast("PeftType", _adapter_type), adapter_config=adapter.config, inference_mode=inference_mode, llm_config_class=self.config_class
).to_peft_config()
adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id)
if self.__llm_adapter_map__ is None and use_cache: self.__llm_adapter_map__ = adapter_map
return adapter_map

def prepare_for_training(self, adapter_type: AdapterType = "lora", use_gradient_checkpointing: bool = True, **attrs: t.Any) -> tuple[peft.PeftModel, T]:
from peft import prepare_model_for_kbit_training
peft_config = self.config["fine_tune_strategies"].get(adapter_type, FineTuneConfig(adapter_type=t.cast("PeftType", adapter_type), llm_config_class=self.config_class)).train().with_config(**attrs).to_peft_config()
peft_config = self.config["fine_tune_strategies"].get(adapter_type, FineTuneConfig(adapter_type=t.cast("PeftType", adapter_type), llm_config_class=self.config_class)).train().with_config(
**attrs
).to_peft_config()
wrapped_peft = peft.get_peft_model(prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checkpointing), peft_config)
if DEBUG: wrapped_peft.print_trainable_parameters()
return wrapped_peft, self.tokenizer
@@ -846,7 +907,13 @@ class LLM(LLMInterface[M, T], ReprMixin):

# order of these fields matter here, make sure to sync it with
# openllm.models.auto.factory.BaseAutoLLMClass.for_model
def to_runner(self, models: list[bentoml.Model] | None = None, max_batch_size: int | None = None, max_latency_ms: int | None = None, scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]:
def to_runner(
self,
models: list[bentoml.Model] | None = None,
max_batch_size: int | None = None,
max_latency_ms: int | None = None,
scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy
) -> LLMRunner[M, T]:
"""Convert this LLM into a Runner.

Args:

@@ -879,7 +946,18 @@ class LLM(LLMInterface[M, T], ReprMixin):
generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))

# NOTE: returning the two langchain API's to the runner
return llm_runner_class(self)(llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig), name=self.runner_name, embedded=False, models=models, max_batch_size=max_batch_size, max_latency_ms=max_latency_ms, method_configs=bentoml_cattr.unstructure({"embeddings": embeddings_sig, "__call__": generate_sig, "generate": generate_sig, "generate_one": generate_sig, "generate_iterator": generate_iterator_sig}), scheduling_strategy=scheduling_strategy,)
return llm_runner_class(self)(
llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig),
name=self.runner_name,
embedded=False,
models=models,
max_batch_size=max_batch_size,
max_latency_ms=max_latency_ms,
method_configs=bentoml_cattr.unstructure({
"embeddings": embeddings_sig, "__call__": generate_sig, "generate": generate_sig, "generate_one": generate_sig, "generate_iterator": generate_iterator_sig
}),
scheduling_strategy=scheduling_strategy,
)

# NOTE: Scikit API
def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
@@ -908,7 +986,18 @@ class LLM(LLMInterface[M, T], ReprMixin):
pass
return [it]

def generate_iterator(self, prompt: str, /, *, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]:
def generate_iterator(
self,
prompt: str,
/,
*,
context_length: int | None = None,
echo: bool = True,
stream_interval: int = 2,
stop: str | t.Iterable[str] | None = None,
stop_token_ids: list[int] | None = None,
**attrs: t.Any
) -> t.Iterator[t.Any]:
# NOTE: encoder-decoder models will need to implement their own generate_iterator for now
# inspired from fastchat's generate_stream_func
from ._generation import prepare_logits_processor, get_context_length, is_partial_stop

@@ -937,7 +1026,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
logits = out.logits
past_key_values = out.past_key_values

last_token_logits = logits_processor(torch.as_tensor([output_ids], device=logits.device) if self.config["repetition_penalty"] > 1.0 else None, logits[:, -1, :])[0] if logits_processor else logits[0, -1, :]
last_token_logits = logits_processor(torch.as_tensor([output_ids], device=logits.device)
if self.config["repetition_penalty"] > 1.0 else None, logits[:, -1, :])[0] if logits_processor else logits[0, -1, :]
# Switch to CPU by avoiding some bugs in mps backend.
if self.device.type == "mps": last_token_logits = last_token_logits.float().to("cpu")
@@ -24,14 +24,27 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None)
int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False)

autogptq_attrs: DictStrAny = {"bits": attrs.pop("gptq_bits", 4), "group_size": attrs.pop("gptq_group_size", -1), "damp_percent": attrs.pop("gptq_damp_percent", 0.01), "desc_act": attrs.pop("gptq_desc_act", True), "sym": attrs.pop("gptq_sym", True), "true_sequential": attrs.pop("gptq_true_sequential", True),}
autogptq_attrs: DictStrAny = {
"bits": attrs.pop("gptq_bits", 4),
"group_size": attrs.pop("gptq_group_size", -1),
"damp_percent": attrs.pop("gptq_damp_percent", 0.01),
"desc_act": attrs.pop("gptq_desc_act", True),
"sym": attrs.pop("gptq_sym", True),
"true_sequential": attrs.pop("gptq_true_sequential", True),
}

def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
if int8_skip_modules is None: int8_skip_modules = []
if "lm_head" not in int8_skip_modules and cls.config_class.__openllm_model_type__ == "causal_lm":
logger.debug("Skipping 'lm_head' for quantization for %s", cls.__name__)
int8_skip_modules.append("lm_head")
return transformers.BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload, llm_int8_threshhold=int8_threshold, llm_int8_skip_modules=int8_skip_modules, llm_int8_has_fp16_weight=int8_has_fp16_weight,)
return transformers.BitsAndBytesConfig(
load_in_8bit=True,
llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload,
llm_int8_threshhold=int8_threshold,
llm_int8_skip_modules=int8_skip_modules,
llm_int8_has_fp16_weight=int8_has_fp16_weight,
)

# 4 bit configuration
int4_compute_dtype = attrs.pop("bnb_4bit_compute_dtype", torch.bfloat16)
@@ -44,13 +57,21 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
if not is_bitsandbytes_available(): raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
if quantise == "int8": quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == "int4":
if is_transformers_supports_kbit(): quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=int4_compute_dtype, bnb_4bit_quant_type=int4_quant_type, bnb_4bit_use_double_quant=int4_use_double_quant)
if is_transformers_supports_kbit():
quantisation_config = transformers.BitsAndBytesConfig(
load_in_4bit=True, bnb_4bit_compute_dtype=int4_compute_dtype, bnb_4bit_quant_type=int4_quant_type, bnb_4bit_use_double_quant=int4_use_double_quant
)
else:
logger.warning("'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.", pkg.pkg_version_info("transformers"))
logger.warning(
"'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.",
pkg.pkg_version_info("transformers")
)
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == "gptq":
if not is_autogptq_available():
logger.warning("'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes.")
logger.warning(
"'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
)
quantisation_config = create_int8_config(int8_skip_modules)
else:
quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
@@ -1,13 +1,15 @@
# mypy: disable-error-code="call-arg,misc,attr-defined,type-abstract"
# mypy: disable-error-code="call-arg,misc,attr-defined,type-abstract,type-arg,valid-type,arg-type"
from __future__ import annotations
import os, warnings, orjson, bentoml, openllm, openllm_core, typing as t
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route
if t.TYPE_CHECKING:
from openllm_core._typing_compat import TypeAlias
from starlette.requests import Request
from starlette.responses import Response
from bentoml._internal.runner.runner import RunnerMethod, AbstractRunner
_EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]], t.Sequence[openllm.LLMEmbeddings]]
# The following warnings from bitsandbytes, and probably not that important for users to see
warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
@@ -16,7 +18,13 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}") # openllm: model na
adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""") # openllm: model adapter map
llm_config = openllm.AutoConfig.for_model(model)
runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm_core.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300) # type: ignore[arg-type] # XXX: remove once bentoml.Runner is correct set with type.
generic_embedding_runner = bentoml.Runner(
openllm.GenericEmbeddingRunnable, # XXX: remove arg-type once bentoml.Runner is correct set with type
name="llm-generic-embedding",
scheduling_strategy=openllm_core.CascadingResourceStrategy,
max_batch_size=32,
max_latency_ms=300
)
runners: list[AbstractRunner] = [runner]
if not runner.supports_embeddings: runners.append(generic_embedding_runner)
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)
@@ -31,9 +39,29 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, **qa_inputs.llm_config.model_dump())
@svc.api(route="/v1/metadata", input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample({"model_id": runner.llm.model_id, "timeout": 3600, "model_name": llm_config["model_name"], "framework": "pt", "configuration": "", "supports_embeddings": runner.supports_embeddings, "supports_hf_agent": runner.supports_hf_agent}))
@svc.api(
route="/v1/metadata",
input=bentoml.io.Text(),
output=bentoml.io.JSON.from_sample({
"model_id": runner.llm.model_id,
"timeout": 3600,
"model_name": llm_config["model_name"],
"framework": "pt",
"configuration": "",
"supports_embeddings": runner.supports_embeddings,
"supports_hf_agent": runner.supports_hf_agent
})
)
def metadata_v1(_: str) -> openllm.MetadataOutput:
return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent)
return openllm.MetadataOutput(
timeout=llm_config["timeout"],
model_name=llm_config["model_name"],
framework=llm_config["env"]["framework_value"],
model_id=runner.llm.model_id,
configuration=llm_config.model_dump_json().decode(),
supports_embeddings=runner.supports_embeddings,
supports_hf_agent=runner.supports_hf_agent
)
@svc.api(
route="/v1/embeddings",
input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]),
@@ -70,7 +98,7 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
})
)
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type]
embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type]
responses = (await embed_call.async_run(phrases))[0]
return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"])
if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():

@@ -5,7 +5,10 @@ These utilities will stay internal, and its API can be changed or updated withou
from __future__ import annotations
import os, typing as t
from openllm_core.utils import LazyModule
_import_structure: dict[str, list[str]] = {"_package": ["create_bento", "build_editable", "construct_python_options", "construct_docker_options"], "oci": ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"]}
_import_structure: dict[str, list[str]] = {
"_package": ["create_bento", "build_editable", "construct_python_options", "construct_docker_options"],
"oci": ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"]
}

if t.TYPE_CHECKING:
from . import _package as _package, oci as oci
@@ -54,7 +54,18 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
packages.extend([importlib.metadata.version("flax"), importlib.metadata.version("jax"), importlib.metadata.version("jaxlib")])
elif framework_envvar == "tf":
if not openllm_core.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
candidates = ("tensorflow", "tensorflow-cpu", "tensorflow-gpu", "tf-nightly", "tf-nightly-cpu", "tf-nightly-gpu", "intel-tensorflow", "intel-tensorflow-avx512", "tensorflow-rocm", "tensorflow-macos",)
candidates = (
"tensorflow",
"tensorflow-cpu",
"tensorflow-gpu",
"tf-nightly",
"tf-nightly-cpu",
"tf-nightly-gpu",
"intel-tensorflow",
"intel-tensorflow-avx512",
"tensorflow-rocm",
"tensorflow-macos",
)
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
for candidate in candidates:
try:
@@ -70,15 +81,39 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
if not openllm_core.utils.is_torch_available(): raise ValueError("PyTorch is not available. Make sure to have it locally installed.")
packages.extend([f'torch>={importlib.metadata.version("torch")}'])
wheels: list[str] = []
built_wheels: list[str | None] = [build_editable(llm_fs.getsyspath("/"), t.cast(t.Literal["openllm", "openllm_core", "openllm_client"], p)) for p in ("openllm_core", "openllm_client", "openllm")]
built_wheels: list[str | None] = [
build_editable(llm_fs.getsyspath("/"), t.cast(t.Literal["openllm", "openllm_core", "openllm_client"], p)) for p in ("openllm_core", "openllm_client", "openllm")
]
if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
def construct_docker_options(
llm: openllm.LLM[t.Any, t.Any],
_: FS,
workers_per_resource: float,
quantize: LiteralString | None,
bettertransformer: bool | None,
adapter_map: dict[str, str | None] | None,
dockerfile_template: str | None,
runtime: t.Literal["ggml", "transformers"],
serialisation_format: t.Literal["safetensors", "legacy"],
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy
) -> DockerOptions:
from openllm.cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy())
env: openllm_core.utils.EnvVarMixin = llm.config["env"]
if env["framework_value"] == "vllm": serialisation_format = "legacy"
env_dict = {env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'", env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format, "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'",}
env_dict = {
env.framework: env["framework_value"],
env.config: f"'{llm.config.model_dump_json().decode()}'",
env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}",
"OPENLLM_MODEL": llm.config["model_name"],
"OPENLLM_SERIALIZATION": serialisation_format,
"OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'",
"BENTOML_DEBUG": str(True),
"BENTOML_QUIET": str(False),
"BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'",
}
if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")

# We need to handle None separately here, as env from subprocess doesn't accept None value.
@@ -120,7 +155,8 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | N
src_contents = f.readlines()
for it in src_contents:
if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + "\n")
elif OPENLLM_MODEL_ADAPTER_MAP in it: src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n")
elif OPENLLM_MODEL_ADAPTER_MAP in it:
src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n")
script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + "".join(src_contents)
if DEBUG: logger.info("Generated script:\n%s", script)
llm_fs.writetext(llm.config["service_name"], script)

@@ -170,7 +206,9 @@ def create_bento(
exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
models=[llm_spec],
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy)
docker=construct_docker_options(
llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy
)
)

bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/"))
@@ -45,7 +45,9 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
# If docker is not found, then fallback to previous behaviour. Which the container might not exists.
docker_bin = shutil.which("docker")
if docker_bin is None:
logger.warning("To get the correct available nightly container, make sure to have docker available. Fallback to previous behaviour for determine nightly hash (container might not exists due to the lack of GPU machine at a time. See https://github.com/bentoml/OpenLLM/pkgs/container/openllm for available image.)")
logger.warning(
"To get the correct available nightly container, make sure to have docker available. Fallback to previous behaviour for determine nightly hash (container might not exists due to the lack of GPU machine at a time. See https://github.com/bentoml/OpenLLM/pkgs/container/openllm for available image.)"
)
commits = t.cast("list[dict[str, t.Any]]", cls._ghapi.repos.list_commits(since=_commit_time_range()))
return next(f'sha-{it["sha"][:7]}' for it in commits if "[skip ci]" not in it["commit"]["message"])
# now is the correct behaviour

@@ -71,7 +73,8 @@ class RefResolver:
version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")["object"]["sha"], version_str)
else:
version = ("", version_str)
if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12):
raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
return _RefTuple((*version, "release" if _use_base_strategy else "custom"))

@classmethod
@@ -96,7 +99,12 @@ class RefResolver:
@functools.lru_cache(maxsize=256)
def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str:
return RefResolver.from_strategy(strategy).tag
def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None, version_strategy: LiteralContainerVersionStrategy = "release", push: bool = False, machine: bool = False) -> dict[str | LiteralContainerRegistry, str]:
def build_container(
registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None,
version_strategy: LiteralContainerVersionStrategy = "release",
push: bool = False,
machine: bool = False
) -> dict[str | LiteralContainerRegistry, str]:
try:
if not _BUILDER.health(): raise openllm.exceptions.Error
except (openllm.exceptions.Error, subprocess.CalledProcessError):
@@ -106,12 +114,22 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
if not _module_location: raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
pyproject_path = pathlib.Path(_module_location).parent.parent / "pyproject.toml"
if not pyproject_path.exists(): raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
if not registries: tags: dict[str | LiteralContainerRegistry, str] = {alias: f"{value}:{get_base_container_tag(version_strategy)}" for alias, value in _CONTAINER_REGISTRY.items()} # default to all registries with latest tag strategy
if not registries:
tags: dict[str | LiteralContainerRegistry, str] = {
alias: f"{value}:{get_base_container_tag(version_strategy)}" for alias, value in _CONTAINER_REGISTRY.items()
} # default to all registries with latest tag strategy
else:
registries = [registries] if isinstance(registries, str) else list(registries)
tags = {name: f"{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}" for name in registries}
try:
outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), context_path=pyproject_path.parent.__fspath__(), tag=tuple(tags.values()), push=push, progress="plain" if openllm_core.utils.get_debug_mode() else "auto", quiet=machine)
outputs = _BUILDER.build(
file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(),
context_path=pyproject_path.parent.__fspath__(),
tag=tuple(tags.values()),
push=push,
progress="plain" if openllm_core.utils.get_debug_mode() else "auto",
quiet=machine
)
if machine and outputs is not None: tags["image_sha"] = outputs.decode("utf-8").strip()
except Exception as err:
raise openllm.exceptions.OpenLLMException(f"Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}") from err
@@ -25,7 +25,12 @@ def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop("BENTOML_CONFIG_OPTIONS", "")
_bentoml_config_options_opts = ["tracing.sample_rate=1.0", f"api_server.traffic.timeout={server_timeout}", f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}', f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}']
_bentoml_config_options_opts = [
"tracing.sample_rate=1.0",
f"api_server.traffic.timeout={server_timeout}",
f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'
]
if device:
if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
@@ -81,17 +86,38 @@ Available official model_id(s): [default: {llm_config['default_id']}]

if llm_config["requires_gpu"] and openllm.utils.device_count() < 1:
# NOTE: The model requires GPU, therefore we will return a dummy command
command_attrs.update({"short_help": "(Disabled because there is no GPU available)", "help": f"{model} is currently not available to run on your local machine because it requires GPU for inference."})
command_attrs.update({
"short_help": "(Disabled because there is no GPU available)", "help": f"{model} is currently not available to run on your local machine because it requires GPU for inference."
})
return noop_command(group, llm_config, _serve_grpc, **command_attrs)

@group.command(**command_attrs)
@start_decorator(llm_config, serve_grpc=_serve_grpc)
@click.pass_context
def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString, device: t.Tuple[str, ...], quantize: t.Literal["int8", "int4", "gptq"] | None, bettertransformer: bool | None, runtime: t.Literal["ggml", "transformers"], fast: bool, serialisation_format: t.Literal["safetensors", "legacy"], cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
def start_cmd(
ctx: click.Context,
/,
server_timeout: int,
model_id: str | None,
model_version: str | None,
workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString,
device: t.Tuple[str, ...],
quantize: t.Literal["int8", "int4", "gptq"] | None,
bettertransformer: bool | None,
runtime: t.Literal["ggml", "transformers"],
fast: bool,
serialisation_format: t.Literal["safetensors", "legacy"],
cors: bool,
adapter_id: str | None,
return_process: bool,
**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
termui.echo(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg="yellow")
termui.echo(
f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
fg="yellow"
)
adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)
config, server_attrs = llm_config.model_validate_click(**attrs)
server_timeout = openllm.utils.first_not_none(server_timeout, default=config["timeout"])
@@ -117,21 +143,34 @@ Available official model_id(s): [default: {llm_config['default_id']}]
wpr = float(wpr)

# Create a new model env to work with the envvar during CLI invocation
env = openllm.utils.EnvVarMixin(config["model_name"], config.default_implementation(), model_id=model_id or config["default_id"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
env = openllm.utils.EnvVarMixin(
config["model_name"], config.default_implementation(), model_id=model_id or config["default_id"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
)
prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))

# NOTE: This is to set current configuration
start_env = os.environ.copy()
start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
if fast: termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow")
if fast:
termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow")

start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()), "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env["runtime_value"], env.framework: env["framework_value"]})
start_env.update({
"OPENLLM_MODEL": model,
"BENTOML_DEBUG": str(openllm.utils.get_debug_mode()),
"BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()),
"OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(),
"OPENLLM_SERIALIZATION": serialisation_format,
env.runtime: env["runtime_value"],
env.framework: env["framework_value"]
})
if env["model_id_value"]: start_env[env.model_id] = str(env["model_id_value"])
# NOTE: quantize and bettertransformer value is already assigned within env
if bettertransformer is not None: start_env[env.bettertransformer] = str(env["bettertransformer_value"])
if quantize is not None: start_env[env.quantize] = str(t.cast(str, env["quantize_value"]))

llm = openllm.utils.infer_auto_class(env["framework_value"]).for_model(model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format)
llm = openllm.utils.infer_auto_class(env["framework_value"]).for_model(
model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format
)
start_env.update({env.config: llm.config.model_dump_json().decode()})

server = bentoml.GrpcServer("_service:svc", **server_attrs) if _serve_grpc else bentoml.HTTPServer("_service:svc", **server_attrs)
@@ -174,7 +213,8 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
return noop
def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
if adapter_map and not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
if quantize and llm_config.default_implementation() == "vllm": ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.")
if quantize and llm_config.default_implementation() == "vllm":
ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.")
requirements = llm_config["requirements"]
if requirements is not None and len(requirements) > 0:
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
@@ -204,12 +244,22 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
|
||||
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
|
||||
""",
|
||||
),
|
||||
cog.optgroup.option("--device", type=openllm.utils.dantic.CUDA, multiple=True, envvar="CUDA_VISIBLE_DEVICES", callback=parse_device_callback, help=f"Assign GPU devices (if available) for {llm_config['model_name']}.", show_envvar=True),
|
||||
cog.optgroup.option(
|
||||
"--device",
|
||||
type=openllm.utils.dantic.CUDA,
multiple=True,
envvar="CUDA_VISIBLE_DEVICES",
callback=parse_device_callback,
help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
show_envvar=True
),
cog.optgroup.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers."),
quantize_option(factory=cog.optgroup, model_env=llm_config["env"]),
bettertransformer_option(factory=cog.optgroup, model_env=llm_config["env"]),
serialisation_option(factory=cog.optgroup),
cog.optgroup.group("Fine-tuning related options", help="""\
cog.optgroup.group(
"Fine-tuning related options",
help="""\
Note that the argument `--adapter-id` can accept the following format:
- `--adapter-id /path/to/adapter` (local adapter)
@@ -223,8 +273,16 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
$ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora
```
"""),
cog.optgroup.option("--adapter-id", default=None, help="Optional name or path for given LoRA adapter" + f" to wrap '{llm_config['model_name']}'", multiple=True, callback=_id_callback, metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]"),
"""
),
cog.optgroup.option(
"--adapter-id",
default=None,
help="Optional name or path for given LoRA adapter" + f" to wrap '{llm_config['model_name']}'",
multiple=True,
callback=_id_callback,
metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]"
),
click.option("--return-process", is_flag=True, default=False, help="Internal use only.", hidden=True),
)
return composed(fn)
@@ -246,7 +304,9 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]
from bentoml_cli.cli import cli
command = "serve" if not serve_grpc else "serve-grpc"
group = cog.optgroup.group(f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",)
group = cog.optgroup.group(
f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
)
def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
serve_command = cli.commands[command]
@@ -291,18 +351,46 @@ def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput
def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
return [CompletionItem(it) for it in output]
return cli_option("-o", "--output", "output", type=click.Choice(output), default=default_value, help="Showing output type.", show_default=True, envvar="OPENLLM_OUTPUT", show_envvar=True, shell_complete=complete_output_var, **attrs)(f)
return cli_option(
"-o",
"--output",
"output",
type=click.Choice(output),
default=default_value,
help="Showing output type.",
show_default=True,
envvar="OPENLLM_OUTPUT",
show_envvar=True,
shell_complete=complete_output_var,
**attrs
)(f)
def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option("--fast/--no-fast", show_default=True, default=False, envvar="OPENLLM_USE_LOCAL_LATEST", show_envvar=True, help="""Whether to skip checking if models is already in store.
return cli_option(
"--fast/--no-fast",
show_default=True,
default=False,
envvar="OPENLLM_USE_LOCAL_LATEST",
show_envvar=True,
help="""Whether to skip checking if models is already in store.
This is useful if you already downloaded or setup the model beforehand.
""", **attrs)(f)
""",
**attrs
)(f)
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option("--cors/--no-cors", show_default=True, default=False, envvar="OPENLLM_CORS", show_envvar=True, help="Enable CORS for the server.", **attrs)(f)
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f)
def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option("--model-id", type=click.STRING, default=None, envvar=model_env.model_id if model_env is not None else None, show_envvar=model_env is not None, help="Optional model_id name or path for (fine-tune) weight.", **attrs)(f)
return cli_option(
"--model-id",
type=click.STRING,
default=None,
envvar=model_env.model_id if model_env is not None else None,
show_envvar=model_env is not None,
help="Optional model_id name or path for (fine-tune) weight.",
**attrs
)(f)
def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)(f)
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
@@ -349,14 +437,25 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
- ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
- ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
""" + ("""\n
""" + (
"""\n
> [!NOTE] The workers value passed into 'build' will determine how the LLM can
> be provisioned in Kubernetes as well as in standalone container. This will
> ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ""),
> ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ""
),
**attrs
)(f)
def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option("--bettertransformer", is_flag=True, default=None, envvar=model_env.bettertransformer if model_env is not None else None, show_envvar=model_env is not None, help="Apply FasterTransformer wrapper to serve model. This will applies during serving time." if not build else "Set default environment variable whether to serve this model with FasterTransformer in build time.", **attrs)(f)
return cli_option(
"--bettertransformer",
is_flag=True,
default=None,
envvar=model_env.bettertransformer if model_env is not None else None,
show_envvar=model_env is not None,
help="Apply FasterTransformer wrapper to serve model. This will applies during serving time."
if not build else "Set default environment variable whether to serve this model with FasterTransformer in build time.",
**attrs
)(f)
def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
"--serialisation",
@@ -71,7 +71,14 @@ def _start(
"""
from .entrypoint import start_command, start_grpc_command
llm_config = openllm.AutoConfig.for_model(model_name)
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name, openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
_ModelEnv = openllm_core.utils.EnvVarMixin(
model_name,
openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()),
model_id=model_id,
bettertransformer=bettertransformer,
quantize=quantize,
runtime=runtime
)
os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]
args: list[str] = ["--runtime", runtime]
@@ -87,7 +94,9 @@ def _start(
if additional_args: args.extend(additional_args)
if __test__: args.append("--return-process")
return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(
args=args if len(args) > 0 else None, standalone_mode=False
)
@inject
def _build(
model_name: str,
@@ -190,9 +199,21 @@ def _build(
if e.stderr: raise OpenLLMException(e.stderr.decode("utf-8")) from None
raise OpenLLMException(str(e)) from None
matched = re.match(r"__tag__:([^:\n]+:[^:\n]+)$", output.decode("utf-8").strip())
if matched is None: raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
if matched is None:
raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
return bentoml.get(matched.group(1), _bento_store=bento_store)
def _import_model(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", implementation: LiteralRuntime = "pt", quantize: t.Literal["int8", "int4", "gptq"] | None = None, serialisation_format: t.Literal["legacy", "safetensors"] = "safetensors", additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
def _import_model(
model_name: str,
/,
*,
model_id: str | None = None,
model_version: str | None = None,
runtime: t.Literal["ggml", "transformers"] = "transformers",
implementation: LiteralRuntime = "pt",
quantize: t.Literal["int8", "int4", "gptq"] | None = None,
serialisation_format: t.Literal["legacy", "safetensors"] = "safetensors",
additional_args: t.Sequence[str] | None = None
) -> bentoml.Model:
"""Import a LLM into local store.
> [!NOTE]
@@ -225,7 +225,9 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
|
||||
with formatter.section(_("Extensions")):
|
||||
formatter.write_dl(rows)
|
||||
@click.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="openllm")
|
||||
@click.version_option(None, "--version", "-v", message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}")
|
||||
@click.version_option(
|
||||
None, "--version", "-v", message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}"
|
||||
)
|
||||
def cli() -> None:
|
||||
"""\b
|
||||
██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
|
||||
@@ -257,7 +259,14 @@ def start_grpc_command() -> None:
|
||||
$ openllm start-grpc <model_name> --<options> ...
|
||||
```
|
||||
"""
|
||||
_start_mapping = {"start": {key: start_command_factory(start_command, key, _context_settings=termui.CONTEXT_SETTINGS) for key in CONFIG_MAPPING}, "start-grpc": {key: start_command_factory(start_grpc_command, key, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=True) for key in CONFIG_MAPPING}}
|
||||
_start_mapping = {
|
||||
"start": {
|
||||
key: start_command_factory(start_command, key, _context_settings=termui.CONTEXT_SETTINGS) for key in CONFIG_MAPPING
|
||||
},
|
||||
"start-grpc": {
|
||||
key: start_command_factory(start_grpc_command, key, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=True) for key in CONFIG_MAPPING
|
||||
}
|
||||
}
|
||||
@cli.command(name="import", aliases=["download"])
|
||||
@model_name_argument
|
||||
@click.argument("model_id", type=click.STRING, default=None, metavar="Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]", required=False)
|
||||
@@ -269,7 +278,18 @@ _start_mapping = {"start": {key: start_command_factory(start_command, key, _cont
|
||||
@machine_option
|
||||
@click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, help="The implementation for saving this LLM.")
|
||||
@serialisation_option
|
||||
def import_command(model_name: str, model_id: str | None, converter: str | None, model_version: str | None, output: LiteralOutput, runtime: t.Literal["ggml", "transformers"], machine: bool, implementation: LiteralRuntime | None, quantize: t.Literal["int8", "int4", "gptq"] | None, serialisation_format: t.Literal["safetensors", "legacy"],) -> bentoml.Model:
|
||||
def import_command(
|
||||
model_name: str,
|
||||
model_id: str | None,
|
||||
converter: str | None,
|
||||
model_version: str | None,
|
||||
output: LiteralOutput,
|
||||
runtime: t.Literal["ggml", "transformers"],
|
||||
machine: bool,
|
||||
implementation: LiteralRuntime | None,
|
||||
quantize: t.Literal["int8", "int4", "gptq"] | None,
|
||||
serialisation_format: t.Literal["safetensors", "legacy"],
|
||||
) -> bentoml.Model:
|
||||
"""Setup LLM interactively.
|
||||
|
||||
It accepts two positional arguments: `model_name` and `model_id`. The first name determine
|
||||
@@ -325,7 +345,9 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
|
||||
llm_config = AutoConfig.for_model(model_name)
|
||||
env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
|
||||
impl: LiteralRuntime = first_not_none(implementation, default=env["framework_value"])
|
||||
llm = infer_auto_class(impl).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format)
|
||||
llm = infer_auto_class(impl).for_model(
|
||||
model_name, model_id=env["model_id_value"], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
|
||||
)
|
||||
_previously_saved = False
|
||||
try:
|
||||
_ref = serialisation.get(llm)
|
||||
@@ -356,17 +378,37 @@ def import_command(model_name: str, model_id: str | None, converter: str | None,
|
||||
@quantize_option(factory=cog.optgroup, build=True)
|
||||
@bettertransformer_option(factory=cog.optgroup)
|
||||
@click.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers.")
|
||||
@click.option("--enable-features", multiple=True, nargs=1, metavar="FEATURE[,FEATURE]", help="Enable additional features for building this LLM Bento. Available: {}".format(", ".join(OPTIONAL_DEPENDENCIES)))
|
||||
@click.option("--adapter-id", default=None, multiple=True, metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]", help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.")
|
||||
@click.option(
|
||||
"--enable-features",
|
||||
multiple=True,
|
||||
nargs=1,
|
||||
metavar="FEATURE[,FEATURE]",
|
||||
help="Enable additional features for building this LLM Bento. Available: {}".format(", ".join(OPTIONAL_DEPENDENCIES))
|
||||
)
|
||||
@click.option(
|
||||
"--adapter-id",
|
||||
default=None,
|
||||
multiple=True,
|
||||
metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]",
|
||||
help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed."
|
||||
)
|
||||
@click.option("--build-ctx", help="Build context. This is required if --adapter-id uses relative path", default=None)
|
||||
@model_version_option
|
||||
@click.option("--dockerfile-template", default=None, type=click.File(), help="Optional custom dockerfile template to be used with this BentoLLM.")
|
||||
@serialisation_option
|
||||
@container_registry_option
|
||||
@click.option("--container-version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="release", help="Default container version strategy for the image from '--container-registry'")
|
||||
@click.option(
|
||||
"--container-version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="release", help="Default container version strategy for the image from '--container-registry'"
|
||||
)
|
||||
@fast_option
|
||||
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name="Utilities options")
|
||||
@cog.optgroup.option("--containerize", default=False, is_flag=True, type=click.BOOL, help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.")
|
||||
@cog.optgroup.option(
|
||||
"--containerize",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
type=click.BOOL,
|
||||
help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'."
|
||||
)
|
||||
@cog.optgroup.option("--push", default=False, is_flag=True, type=click.BOOL, help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.")
|
||||
@click.option("--force-push", default=False, is_flag=True, type=click.BOOL, help="Whether to force push.")
|
||||
@click.pass_context
|
||||
@@ -431,7 +473,9 @@ def build_command(
|
||||
if env["quantize_value"]: os.environ[env.quantize] = str(env["quantize_value"])
|
||||
os.environ[env.bettertransformer] = str(env["bettertransformer_value"])
|
||||
|
||||
llm = infer_auto_class(env["framework_value"]).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs)
|
||||
llm = infer_auto_class(env["framework_value"]).for_model(
|
||||
model_name, model_id=env["model_id_value"], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs
|
||||
)
|
||||
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({"_type": llm.llm_type, "_framework": env["framework_value"]})
|
||||
@@ -476,7 +520,20 @@ def build_command(
|
||||
raise bentoml.exceptions.NotFound(f"Rebuilding existing Bento {bento_tag}") from None
|
||||
_previously_built = True
|
||||
except bentoml.exceptions.NotFound:
|
||||
bento = bundle.create_bento(bento_tag, llm_fs, llm, workers_per_resource=workers_per_resource, adapter_map=adapter_map, quantize=quantize, bettertransformer=bettertransformer, extra_dependencies=enable_features, dockerfile_template=dockerfile_template_path, runtime=runtime, container_registry=container_registry, container_version_strategy=container_version_strategy)
|
||||
bento = bundle.create_bento(
|
||||
bento_tag,
|
||||
llm_fs,
|
||||
llm,
|
||||
workers_per_resource=workers_per_resource,
|
||||
adapter_map=adapter_map,
|
||||
quantize=quantize,
|
||||
bettertransformer=bettertransformer,
|
||||
extra_dependencies=enable_features,
|
||||
dockerfile_template=dockerfile_template_path,
|
||||
runtime=runtime,
|
||||
container_registry=container_registry,
|
||||
container_version_strategy=container_version_strategy
|
||||
)
|
||||
except Exception as err:
|
||||
raise err from None
|
||||
|
||||
@@ -486,7 +543,12 @@ def build_command(
|
||||
termui.echo("\n" + OPENLLM_FIGLET, fg="white")
|
||||
if not _previously_built: termui.echo(f"Successfully built {bento}.", fg="green")
|
||||
elif not overwrite: termui.echo(f"'{model_name}' already has a Bento built [{bento}]. To overwrite it pass '--overwrite'.", fg="yellow")
|
||||
termui.echo("📖 Next steps:\n\n" + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" + f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" + "\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n", fg="blue",)
|
||||
termui.echo(
|
||||
"📖 Next steps:\n\n" + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" +
|
||||
f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" +
|
||||
"\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n",
|
||||
fg="blue",
|
||||
)
|
||||
elif output == "json":
|
||||
termui.echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode())
|
||||
else:
|
||||
@@ -538,7 +600,14 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
if config["model_name"] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ("flax",)
|
||||
if config["model_name"] in MODEL_TF_MAPPING_NAMES: runtime_impl += ("tf",)
|
||||
if config["model_name"] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ("vllm",)
|
||||
json_data[m] = {"architecture": config["architecture"], "model_id": config["model_ids"], "cpu": not config["requires_gpu"], "gpu": True, "runtime_impl": runtime_impl, "installation": f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config["requirements"] else "openllm",}
|
||||
json_data[m] = {
|
||||
"architecture": config["architecture"],
|
||||
"model_id": config["model_ids"],
|
||||
"cpu": not config["requires_gpu"],
|
||||
"gpu": True,
|
||||
"runtime_impl": runtime_impl,
|
||||
"installation": f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config["requirements"] else "openllm",
|
||||
}
|
||||
converted.extend([normalise_model_name(i) for i in config["model_ids"]])
|
||||
if DEBUG:
|
||||
try:
|
||||
@@ -546,7 +615,11 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
except Exception as e:
|
||||
failed_initialized.append((m, e))
|
||||
|
||||
ids_in_local_store = {k: [i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k] for k in json_data.keys()}
|
||||
ids_in_local_store = {
|
||||
k: [
|
||||
i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k
|
||||
] for k in json_data.keys()
|
||||
}
|
||||
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
|
||||
local_models: DictStrAny | None = None
|
||||
if show_available:
|
||||
@@ -563,7 +636,9 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralRuntime, ...]]] = []
|
||||
for m, v in json_data.items():
|
||||
data.extend([(m, v["architecture"], v["model_id"], v["installation"], "❌" if not v["cpu"] else "✅", "✅", v["runtime_impl"],)])
|
||||
column_widths = [int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),]
|
||||
column_widths = [
|
||||
int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),
|
||||
]
|
||||
|
||||
if len(data) == 0 and len(failed_initialized) > 0:
|
||||
termui.echo("Exception found while parsing models:\n", fg="yellow")
|
||||
@@ -596,17 +671,22 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
@click.option("-y", "--yes", "--assume-yes", is_flag=True, help="Skip confirmation when deleting a specific model")
|
||||
@click.option("--include-bentos/--no-include-bentos", is_flag=True, default=False, help="Whether to also include pruning bentos.")
|
||||
@inject
|
||||
def prune_command(model_name: str | None, yes: bool, include_bentos: bool, model_store: ModelStore = Provide[BentoMLContainer.model_store], bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> None:
|
||||
def prune_command(
|
||||
model_name: str | None, yes: bool, include_bentos: bool, model_store: ModelStore = Provide[BentoMLContainer.model_store], bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
|
||||
) -> None:
|
||||
"""Remove all saved models, (and optionally bentos) built with OpenLLM locally.
|
||||
|
||||
\b
|
||||
If a model type is passed, then only prune models for that given model type.
|
||||
"""
|
||||
available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [(m, model_store) for m in bentoml.models.list() if "framework" in m.info.labels and m.info.labels["framework"] == "openllm"]
|
||||
available: list[tuple[bentoml.Model | bentoml.Bento,
|
||||
ModelStore | BentoStore]] = [(m, model_store) for m in bentoml.models.list() if "framework" in m.info.labels and m.info.labels["framework"] == "openllm"]
|
||||
if model_name is not None: available = [(m, store) for m, store in available if "model_name" in m.info.labels and m.info.labels["model_name"] == inflection.underscore(model_name)]
|
||||
if include_bentos:
|
||||
if model_name is not None: available += [(b, bento_store) for b in bentoml.bentos.list() if "start_name" in b.info.labels and b.info.labels["start_name"] == inflection.underscore(model_name)]
|
||||
else: available += [(b, bento_store) for b in bentoml.bentos.list() if "_type" in b.info.labels and "_framework" in b.info.labels]
|
||||
if model_name is not None:
|
||||
available += [(b, bento_store) for b in bentoml.bentos.list() if "start_name" in b.info.labels and b.info.labels["start_name"] == inflection.underscore(model_name)]
|
||||
else:
|
||||
available += [(b, bento_store) for b in bentoml.bentos.list() if "_type" in b.info.labels and "_framework" in b.info.labels]
|
||||
|
||||
for store_item, store in available:
|
||||
if yes: delete_confirmed = True
|
||||
@@ -633,15 +713,27 @@ def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, val
|
||||
else:
|
||||
raise click.BadParameter(f"Invalid option format: {value}")
|
||||
def shared_client_options(f: _AnyCallable | None = None, output_value: t.Literal["json", "porcelain", "pretty"] = "pretty") -> t.Callable[[FC], FC]:
|
||||
options = [click.option("--endpoint", type=click.STRING, help="OpenLLM Server endpoint, i.e: http://localhost:3000", envvar="OPENLLM_ENDPOINT", default="http://localhost:3000",), click.option("--timeout", type=click.INT, default=30, help="Default server timeout", show_default=True), output_option(default_value=output_value),]
|
||||
options = [
|
||||
click.option("--endpoint", type=click.STRING, help="OpenLLM Server endpoint, i.e: http://localhost:3000", envvar="OPENLLM_ENDPOINT", default="http://localhost:3000",
|
||||
),
|
||||
click.option("--timeout", type=click.INT, default=30, help="Default server timeout", show_default=True),
|
||||
output_option(default_value=output_value),
|
||||
]
|
||||
return compose(*options)(f) if f is not None else compose(*options)
|
||||
@cli.command()
|
||||
@click.argument("task", type=click.STRING, metavar="TASK")
|
||||
@shared_client_options
|
||||
@click.option("--agent", type=click.Choice(["hf"]), default="hf", help="Whether to interact with Agents from given Server endpoint.", show_default=True)
|
||||
@click.option("--remote", is_flag=True, default=False, help="Whether or not to use remote tools (inference endpoints) instead of local ones.", show_default=True)
|
||||
@click.option("--opt", help="Define prompt options. "
|
||||
"(format: ``--opt text='I love this' --opt audio:./path/to/audio --opt image:/path/to/file``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]")
|
||||
@click.option(
|
||||
"--opt",
|
||||
help="Define prompt options. "
|
||||
"(format: ``--opt text='I love this' --opt audio:./path/to/audio --opt image:/path/to/file``)",
|
||||
required=False,
|
||||
multiple=True,
|
||||
callback=opt_callback,
|
||||
metavar="ARG=VALUE[,ARG=VALUE]"
|
||||
)
|
||||
def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: LiteralOutput, remote: bool, task: str, _memoized: DictStrAny, **attrs: t.Any) -> str:
|
||||
"""Instruct agents interactively for given tasks, from a terminal.
|
||||
|
||||
@@ -675,7 +767,9 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
|
||||
@click.argument("text", type=click.STRING, nargs=-1)
|
||||
@machine_option
|
||||
@click.pass_context
|
||||
def embed_command(ctx: click.Context, text: tuple[str, ...], endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, machine: bool) -> EmbeddingsOutput | None:
|
||||
def embed_command(
|
||||
ctx: click.Context, text: tuple[str, ...], endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, machine: bool
|
||||
) -> EmbeddingsOutput | None:
|
||||
"""Get embeddings interactively, from a terminal.
|
||||
|
||||
\b
|
||||
@@ -703,9 +797,13 @@ def embed_command(ctx: click.Context, text: tuple[str, ...], endpoint: str, time
|
||||
@shared_client_options
|
||||
@click.option("--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True)
|
||||
@click.argument("prompt", type=click.STRING)
|
||||
@click.option("--sampling-params", help="Define query options. (format: ``--opt temperature=0.8 --opt=top_k:12)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]")
|
||||
@click.option(
|
||||
"--sampling-params", help="Define query options. (format: ``--opt temperature=0.8 --opt=top_k:12)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]"
|
||||
)
|
||||
@click.pass_context
|
||||
def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any) -> None:
|
||||
def query_command(
|
||||
ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any
|
||||
) -> None:
|
||||
"""Ask a LLM interactively, from a terminal.
|
||||
|
||||
\b
|
||||
|
||||
@@ -31,5 +31,6 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento
|
||||
# save it to /env/docker/Dockerfile.template. This is necessary
|
||||
# for the reconstruction of the Dockerfile.
|
||||
if "dockerfile_template" in docker_attrs and docker_attrs["dockerfile_template"] is not None: docker_attrs["dockerfile_template"] = "env/docker/Dockerfile.template"
|
||||
termui.echo(generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True,), fg="white")
|
||||
doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True)
|
||||
termui.echo(doc, fg="white")
|
||||
return bentomodel.path
|
||||
|
||||
@@ -11,7 +11,14 @@ LiteralOutput = t.Literal["json", "pretty", "porcelain"]
|
||||
@output_option
|
||||
@click.option("--format", type=click.STRING, default=None)
|
||||
@machine_option
|
||||
@click.option("--opt", help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]")
|
||||
@click.option(
|
||||
"--opt",
|
||||
help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
|
||||
required=False,
|
||||
multiple=True,
|
||||
callback=opt_callback,
|
||||
metavar="ARG=VALUE[,ARG=VALUE]"
|
||||
)
|
||||
@click.pass_context
|
||||
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
|
||||
"""Get the default prompt used by OpenLLM."""
|
||||
|
||||
@@ -8,12 +8,25 @@ from openllm.cli._factory import LiteralOutput, output_option
|
||||
@click.pass_context
|
||||
def cli(ctx: click.Context, output: LiteralOutput) -> None:
|
||||
"""List available bentos built by OpenLLM."""
|
||||
mapping = {k: [{"tag": str(b.tag), "size": human_readable_size(openllm.utils.calc_dir_size(b.path)), "models": [{"tag": str(m.tag), "size": human_readable_size(openllm.utils.calc_dir_size(m.path))} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]} for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {"start_name", "bundler"})) if b.info.labels["start_name"] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())}
|
||||
mapping = {
|
||||
k: [{
|
||||
"tag": str(b.tag),
|
||||
"size": human_readable_size(openllm.utils.calc_dir_size(b.path)),
|
||||
"models": [{
|
||||
"tag": str(m.tag), "size": human_readable_size(openllm.utils.calc_dir_size(m.path))
|
||||
} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
|
||||
} for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {"start_name", "bundler"})) if b.info.labels["start_name"] == k] for k in tuple(
|
||||
inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()
|
||||
)
|
||||
}
|
||||
mapping = {k: v for k, v in mapping.items() if v}
|
||||
if output == "pretty":
|
||||
import tabulate
|
||||
tabulate.PRESERVE_WHITESPACE = True
|
||||
termui.echo(tabulate.tabulate([(k, i["tag"], i["size"], [_["tag"] for _ in i["models"]]) for k, v in mapping.items() for i in v], tablefmt="fancy_grid", headers=["LLM", "Tag", "Size", "Models"]), fg="white")
|
||||
termui.echo(
|
||||
tabulate.tabulate([(k, i["tag"], i["size"], [_["tag"] for _ in i["models"]]) for k, v in mapping.items() for i in v], tablefmt="fancy_grid", headers=["LLM", "Tag", "Size", "Models"]),
|
||||
fg="white"
|
||||
)
|
||||
else:
|
||||
termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg="white")
|
||||
ctx.exit(0)
|
||||
|
||||
@@ -11,8 +11,12 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny
|
||||
def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
|
||||
"""This is equivalent to openllm models --show-available less the nice table."""
|
||||
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
|
||||
ids_in_local_store = {k: [i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k] for k in models}
|
||||
if model_name is not None: ids_in_local_store = {k: [i for i in v if "model_name" in i.info.labels and i.info.labels["model_name"] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
|
||||
ids_in_local_store = {
|
||||
k: [i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k
|
||||
] for k in models
|
||||
}
|
||||
if model_name is not None:
|
||||
ids_in_local_store = {k: [i for i in v if "model_name" in i.info.labels and i.info.labels["model_name"] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
|
||||
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
|
||||
local_models = {k: [{"tag": str(i.tag), "size": human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
|
||||
if output == "pretty":
|
||||
|
||||
@@ -12,7 +12,7 @@ client.embed("What is the difference between gather and scatter?")
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import openllm_client, typing as t
|
||||
if t.TYPE_CHECKING: from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient
|
||||
if t.TYPE_CHECKING: from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient
|
||||
def __dir__() -> t.Sequence[str]:
|
||||
return sorted(dir(openllm_client))
|
||||
def __getattr__(it: str) -> t.Any:
|
||||
|
||||
@@ -3,7 +3,12 @@ import typing as t, os
|
||||
import openllm
|
||||
from openllm_core.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available
|
||||
from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES
|
||||
_import_structure: dict[str, list[str]] = {"modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]}
|
||||
_import_structure: dict[str, list[str]] = {
|
||||
"modeling_auto": ["MODEL_MAPPING_NAMES"],
|
||||
"modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"],
|
||||
"modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"],
|
||||
"modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]
|
||||
}
|
||||
if t.TYPE_CHECKING:
|
||||
from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
|
||||
from .modeling_flax_auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
|
||||
|
||||
@@ -23,7 +23,8 @@ class BaseAutoLLMClass:
|
||||
raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.")
|
||||
|
||||
@classmethod
|
||||
def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False, **attrs: t.Any) -> openllm.LLM[t.Any, t.Any]:
|
||||
def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False,
|
||||
**attrs: t.Any) -> openllm.LLM[t.Any, t.Any]:
|
||||
"""The lower level API for creating a LLM instance.
|
||||
|
||||
```python
|
||||
@@ -62,14 +63,18 @@ class BaseAutoLLMClass:
|
||||
llm_class: The runnable to register.
|
||||
"""
|
||||
if hasattr(llm_class, "config_class") and llm_class.config_class is not config_class:
|
||||
raise ValueError(f"The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has {llm_class.config_class} and you passed {config_class}. Fix one of those so they match!")
|
||||
raise ValueError(
|
||||
f"The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has {llm_class.config_class} and you passed {config_class}. Fix one of those so they match!"
|
||||
)
|
||||
cls._model_mapping.register(config_class, llm_class)
|
||||
|
||||
@classmethod
|
||||
def infer_class_from_name(cls, name: str) -> type[openllm.LLM[t.Any, t.Any]]:
|
||||
config_class = openllm.AutoConfig.infer_class_from_name(name)
|
||||
if config_class in cls._model_mapping: return cls._model_mapping[config_class]
|
||||
raise ValueError(f"Unrecognized configuration class ({config_class}) for {name}. Model name should be one of {', '.join(openllm.CONFIG_MAPPING.keys())} (Registered configuration class: {', '.join([i.__name__ for i in cls._model_mapping.keys()])}).")
|
||||
raise ValueError(
|
||||
f"Unrecognized configuration class ({config_class}) for {name}. Model name should be one of {', '.join(openllm.CONFIG_MAPPING.keys())} (Registered configuration class: {', '.join([i.__name__ for i in cls._model_mapping.keys()])})."
|
||||
)
|
||||
def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any:
|
||||
if attr is None: return
|
||||
if isinstance(attr, tuple): return tuple(getattribute_from_module(module, a) for a in attr)
|
||||
@@ -127,13 +132,23 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
|
||||
return bool(self.keys())
|
||||
|
||||
def keys(self) -> ConfigModelKeysView:
|
||||
return t.cast("ConfigModelKeysView", [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys()))
|
||||
return t.cast(
|
||||
"ConfigModelKeysView", [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())
|
||||
)
|
||||
|
||||
def values(self) -> ConfigModelValuesView:
|
||||
return t.cast("ConfigModelValuesView", [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(self._extra_content.values()))
|
||||
return t.cast(
|
||||
"ConfigModelValuesView", [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(
|
||||
self._extra_content.values()
|
||||
)
|
||||
)
|
||||
|
||||
def items(self) -> ConfigModelItemsView:
|
||||
return t.cast("ConfigModelItemsView", [(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items()))
|
||||
return t.cast(
|
||||
"ConfigModelItemsView",
|
||||
[(self._load_attr_from_module(key, self._config_mapping[key]),
|
||||
self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items())
|
||||
)
|
||||
|
||||
def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]:
|
||||
return iter(t.cast("SupportsIter[t.Iterator[type[openllm.LLMConfig]]]", self.keys()))
|
||||
|
||||
@@ -3,7 +3,9 @@ import typing as t
|
||||
from collections import OrderedDict
|
||||
from .factory import BaseAutoLLMClass, _LazyAutoMapping
|
||||
from openllm_core.config import CONFIG_MAPPING_NAMES
|
||||
MODEL_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLM"), ("dolly_v2", "DollyV2"), ("falcon", "Falcon"), ("flan_t5", "FlanT5"), ("gpt_neox", "GPTNeoX"), ("llama", "Llama"), ("mpt", "MPT"), ("opt", "OPT"), ("stablelm", "StableLM"), ("starcoder", "StarCoder"), ("baichuan", "Baichuan")])
|
||||
MODEL_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLM"), ("dolly_v2", "DollyV2"), ("falcon", "Falcon"), ("flan_t5", "FlanT5"), ("gpt_neox", "GPTNeoX"), ("llama", "Llama"), ("mpt", "MPT"), (
|
||||
"opt", "OPT"
|
||||
), ("stablelm", "StableLM"), ("starcoder", "StarCoder"), ("baichuan", "Baichuan")])
|
||||
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
|
||||
class AutoLLM(BaseAutoLLMClass):
|
||||
_model_mapping: t.ClassVar = MODEL_MAPPING
|
||||
|
||||
@@ -3,7 +3,9 @@ import typing as t
|
||||
from collections import OrderedDict
|
||||
from .factory import BaseAutoLLMClass, _LazyAutoMapping
|
||||
from openllm_core.config import CONFIG_MAPPING_NAMES
|
||||
MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")])
|
||||
MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), (
|
||||
"opt", "VLLMOPT"
|
||||
), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")])
|
||||
MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
|
||||
class AutoVLLM(BaseAutoLLMClass):
|
||||
_model_mapping: t.ClassVar = MODEL_VLLM_MAPPING
|
||||
|
||||
@@ -4,7 +4,7 @@ from openllm_core._typing_compat import overload
|
||||
from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id
|
||||
|
||||
if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
|
||||
else: torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow")
|
||||
else: torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow")
|
||||
logger = logging.getLogger(__name__)
|
||||
@overload
|
||||
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline:
|
||||
@@ -52,7 +52,12 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
|
||||
input_ids, attention_mask = input_tensors["input_ids"], input_tensors.get("attention_mask", None)
|
||||
if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
|
||||
else: in_b = input_ids.shape[0]
|
||||
generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None, attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None, pad_token_id=self.tokenizer.pad_token_id, **generate_kwargs)
|
||||
generated_sequence = self.model.generate(
|
||||
input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
|
||||
attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
|
||||
pad_token_id=self.tokenizer.pad_token_id,
|
||||
**generate_kwargs
|
||||
)
|
||||
out_b = generated_sequence.shape[0]
|
||||
if self.framework == "pt": generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
|
||||
elif self.framework == "tf": generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
|
||||
|
||||
@@ -12,7 +12,14 @@ class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): # type: ignore[attr-defined]
|
||||
return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()), skip_special_tokens=True)
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(
|
||||
input_ids=inputs["input_ids"],
|
||||
attention_mask=inputs["attention_mask"],
|
||||
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()
|
||||
),
|
||||
skip_special_tokens=True
|
||||
)
|
||||
|
||||
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
|
||||
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
||||
|
||||
@@ -7,7 +7,10 @@ class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformer
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
import torch
|
||||
with torch.inference_mode():
|
||||
return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True
|
||||
)
|
||||
|
||||
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
|
||||
import torch, torch.nn.functional as F
|
||||
|
||||
@@ -6,11 +6,38 @@ if t.TYPE_CHECKING: import transformers
|
||||
class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, decoder_start_token_id: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
def sanitize_parameters(
|
||||
self,
|
||||
prompt: str,
|
||||
max_new_tokens: int | None = None,
|
||||
temperature: float | None = None,
|
||||
top_k: int | None = None,
|
||||
top_p: float | None = None,
|
||||
repetition_penalty: float | None = None,
|
||||
decoder_start_token_id: int | None = None,
|
||||
use_default_prompt_template: bool = True,
|
||||
**attrs: t.Any
|
||||
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
|
||||
if decoder_start_token_id is None: decoder_start_token_id = 0
|
||||
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty, "decoder_start_token_id": decoder_start_token_id}, {}
|
||||
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
|
||||
"max_new_tokens": max_new_tokens,
|
||||
"temperature": temperature,
|
||||
"top_k": top_k,
|
||||
"top_p": top_p,
|
||||
"repetition_penalty": repetition_penalty,
|
||||
"decoder_start_token_id": decoder_start_token_id
|
||||
}, {}
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
# NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
|
||||
decoder_start_token_id = attrs.pop("decoder_start_token_id", 0)
|
||||
return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="np")["input_ids"], do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), decoder_start_token_id=decoder_start_token_id).sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(
|
||||
self.tokenizer(prompt, return_tensors="np")["input_ids"],
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
decoder_start_token_id=decoder_start_token_id
|
||||
).sequences,
|
||||
skip_special_tokens=True,
|
||||
clean_up_tokenization_spaces=True
|
||||
)
|
||||
|
||||
@@ -5,4 +5,7 @@ class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transfo
|
||||
__openllm_internal__ = True
|
||||
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True
|
||||
)
|
||||
|
||||
@@ -20,4 +20,12 @@ class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNe
|
||||
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
|
||||
import torch
|
||||
with torch.inference_mode():
|
||||
return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])))
|
||||
return self.tokenizer.batch_decode(
|
||||
self.model.generate(
|
||||
self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids,
|
||||
do_sample=True,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
|
||||
pad_token_id=self.tokenizer.eos_token_id,
|
||||
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])
|
||||
)
|
||||
)
|
||||
|
||||
@@ -4,12 +4,17 @@ from openllm.utils import generate_labels, is_triton_available
|
||||
if t.TYPE_CHECKING: import transformers, torch
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True) -> transformers.PretrainedConfig:
|
||||
def get_mpt_config(
|
||||
model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True
|
||||
) -> transformers.PretrainedConfig:
|
||||
import torch
|
||||
config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
|
||||
if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
|
||||
if hasattr(config, "attn_config") and is_triton_available(): config.attn_config["attn_impl"] = "triton"
|
||||
else: logger.debug("'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'")
|
||||
else:
|
||||
logger.debug(
|
||||
"'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'"
|
||||
)
|
||||
# setting max_seq_len
|
||||
config.max_seq_len = max_sequence_length
|
||||
return config
|
||||
@@ -46,7 +51,9 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
|
||||
device_map = attrs.pop("device_map", None)
|
||||
trust_remote_code = attrs.pop("trust_remote_code", True)
|
||||
config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs)
|
||||
model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs
|
||||
)
|
||||
model.tie_weights()
|
||||
return model
|
||||
|
||||
@@ -54,7 +61,12 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
    import torch
    llm_config = self.config.model_construct_env(**attrs)
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
    attrs = {"do_sample": False if llm_config["temperature"] == 0 else True, "eos_token_id": self.tokenizer.eos_token_id, "pad_token_id": self.tokenizer.pad_token_id, "generation_config": llm_config.to_generation_config()}
    attrs = {
      "do_sample": False if llm_config["temperature"] == 0 else True,
      "eos_token_id": self.tokenizer.eos_token_id,
      "pad_token_id": self.tokenizer.pad_token_id,
      "generation_config": llm_config.to_generation_config()
    }
    with torch.inference_mode():
      if torch.cuda.is_available():
        with torch.autocast("cuda", torch.float16): # type: ignore[attr-defined]

@@ -13,10 +13,27 @@ class FlaxOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tok
  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
    tokenizer.pad_token_id = config.pad_token_id
    return bentoml.transformers.save_model(self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
    return bentoml.transformers.save_model(
      self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)
    )

  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty}, {}
  def sanitize_parameters(
    self,
    prompt: str,
    max_new_tokens: int | None = None,
    temperature: float | None = None,
    top_k: int | None = None,
    num_return_sequences: int | None = None,
    repetition_penalty: float | None = None,
    use_default_prompt_template: bool = False,
    **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
      "max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty
    }, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True)
    return self.tokenizer.batch_decode(
      self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
      skip_special_tokens=True
    )

@@ -14,4 +14,7 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
    with torch.inference_mode():
      return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
      return self.tokenizer.batch_decode(
        self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
        skip_special_tokens=True
      )

@@ -9,7 +9,15 @@ class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Token
    import transformers
    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
    tokenizer.pad_token_id = config.pad_token_id
    return bentoml.transformers.save_model(self.tag, transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
    return bentoml.transformers.save_model(
      self.tag,
      transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),
      custom_objects={"tokenizer": tokenizer},
      labels=generate_labels(self)
    )

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
    return self.tokenizer.batch_decode(
      self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
      skip_special_tokens=True
    )

@@ -7,5 +7,16 @@ class VLLMOPT(openllm.LLM["vllm.LLMEngine", "transformers.GPT2Tokenizer"]):
  __openllm_internal__ = True
  tokenizer_id = "local"

  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
  def sanitize_parameters(
    self,
    prompt: str,
    max_new_tokens: int | None = None,
    temperature: float | None = None,
    top_k: int | None = None,
    num_return_sequences: int | None = None,
    use_default_prompt_template: bool = True,
    **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
      "max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences
    }, {}

@@ -16,4 +16,15 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
    with torch.inference_mode():
      return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0], skip_special_tokens=True)]
      return [
        self.tokenizer.decode(
          self.model.generate(
            **self.tokenizer(prompt, return_tensors="pt").to(self.device),
            do_sample=True,
            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
            pad_token_id=self.tokenizer.eos_token_id,
            stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])
          )[0],
          skip_special_tokens=True
        )
      ]

@@ -27,7 +27,12 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
    with torch.inference_mode():
      # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
      # NOTE: support fine-tuning starcoder
      result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors="pt").to(self.device), do_sample=True, pad_token_id=self.tokenizer.eos_token_id, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
      result_tensor = self.model.generate(
        self.tokenizer.encode(prompt, return_tensors="pt").to(self.device),
        do_sample=True,
        pad_token_id=self.tokenizer.eos_token_id,
        generation_config=self.config.model_construct_env(**attrs).to_generation_config()
      )
      # TODO: We will probably want to return the tokenizer here so that we can manually process this
      # return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
      return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

@@ -47,9 +47,11 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
      try:
        tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"]
      except KeyError:
        raise openllm.exceptions.OpenLLMException("Bento model does not have tokenizer. Make sure to save"
                                                  " the tokenizer within the model via 'custom_objects'."
                                                  " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
        raise openllm.exceptions.OpenLLMException(
          "Bento model does not have tokenizer. Make sure to save"
          " the tokenizer within the model via 'custom_objects'."
          " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\""
        ) from None
  else:
    tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath("/"), trust_remote_code=llm.__llm_trust_remote_code__, **tokenizer_attrs)

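For reference, a minimal sketch of the saving pattern the error message above points to, mirroring the save_model calls elsewhere in this diff; the model id and tag are placeholders:

import bentoml, transformers

model_id = "facebook/opt-125m"  # placeholder model id
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(model_id)
# Storing the tokenizer under custom_objects is what load_tokenizer unpickles above.
bentoml.transformers.save_model("my-opt", model, custom_objects={"tokenizer": tokenizer})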
@@ -1,3 +1,8 @@
from __future__ import annotations
FRAMEWORK_TO_AUTOCLASS_MAPPING = {"pt": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"), "tf": ("TFAutoModelForCausalLM", "TFAutoModelForSeq2SeqLM"), "flax": ("FlaxAutoModelForCausalLM", "FlaxAutoModelForSeq2SeqLM"), "vllm": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM")}
FRAMEWORK_TO_AUTOCLASS_MAPPING = {
  "pt": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"),
  "tf": ("TFAutoModelForCausalLM", "TFAutoModelForSeq2SeqLM"),
  "flax": ("FlaxAutoModelForCausalLM", "FlaxAutoModelForSeq2SeqLM"),
  "vllm": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM")
}
HUB_ATTRS = ["cache_dir", "code_revision", "force_download", "local_files_only", "proxies", "resume_download", "revision", "subfolder", "use_auth_token"]

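As a hedged sketch of how a table like FRAMEWORK_TO_AUTOCLASS_MAPPING is typically consumed, with the helper name below being illustrative rather than part of this module:

import transformers

def resolve_autoclass(framework: str, seq2seq: bool = False) -> type:
  # Index 0 holds the causal-LM auto class name, index 1 the seq2seq variant.
  name = FRAMEWORK_TO_AUTOCLASS_MAPPING[framework][1 if seq2seq else 0]
  # Resolve the stored class name back to the concrete transformers auto class.
  return getattr(transformers, name)

# resolve_autoclass("pt") -> transformers.AutoModelForCausalLM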
@@ -55,7 +55,8 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
  signatures: DictStrAny = {}

  if quantize_method == "gptq":
    if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
    if not openllm.utils.is_autogptq_available():
      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
    if llm.config["model_type"] != "causal_lm": raise openllm.exceptions.OpenLLMException(f"GPTQ only supports Causal LM (got {llm.__class__} of {llm.config['model_type']})")
    signatures["generate"] = {"batchable": False}
  else:
@@ -70,16 +71,33 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,

  external_modules: list[types.ModuleType] = [importlib.import_module(tokenizer.__module__)]
  imported_modules: list[types.ModuleType] = []
  bentomodel = bentoml.Model.create(llm.tag, module="openllm.serialisation.transformers", api_version="v1", options=ModelOptions(), context=openllm.utils.generate_context(framework_name="openllm"), labels=openllm.utils.generate_labels(llm), signatures=signatures if signatures else make_model_signatures(llm))
  bentomodel = bentoml.Model.create(
    llm.tag,
    module="openllm.serialisation.transformers",
    api_version="v1",
    options=ModelOptions(),
    context=openllm.utils.generate_context(framework_name="openllm"),
    labels=openllm.utils.generate_labels(llm),
    signatures=signatures if signatures else make_model_signatures(llm)
  )
  with openllm.utils.analytics.set_bentoml_tracking():
    try:
      bentomodel.enter_cloudpickle_context(external_modules, imported_modules)
      tokenizer.save_pretrained(bentomodel.path)
      if quantize_method == "gptq":
        if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
        if not openllm.utils.is_autogptq_available():
          raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
        if llm.config["model_type"] != "causal_lm": raise openllm.exceptions.OpenLLMException(f"GPTQ only supports Causal LM (got {llm.__class__} of {llm.config['model_type']})")
        logger.debug("Saving model with GPTQ quantisation will require loading model into memory.")
        model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id, *decls, quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), trust_remote_code=trust_remote_code, use_safetensors=safe_serialisation, **hub_attrs, **attrs,)
        model = autogptq.AutoGPTQForCausalLM.from_quantized(
          llm.model_id,
          *decls,
          quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
          trust_remote_code=trust_remote_code,
          use_safetensors=safe_serialisation,
          **hub_attrs,
          **attrs,
        )
        update_model(bentomodel, metadata={"_pretrained_class": model.__class__.__name__, "_framework": model.model.framework})
        model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation)
      else:
@@ -120,8 +138,10 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
  """
  try:
    model = bentoml.models.get(llm.tag)
    if model.info.module not in ("openllm.serialisation.transformers"
                                 "bentoml.transformers", "bentoml._internal.frameworks.transformers", __name__): # NOTE: backward compatible with previous version of OpenLLM.
    if model.info.module not in (
      "openllm.serialisation.transformers"
      "bentoml.transformers", "bentoml._internal.frameworks.transformers", __name__
    ): # NOTE: backward compatible with previous version of OpenLLM.
      raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.")
    if "runtime" in model.info.labels and model.info.labels["runtime"] != llm.runtime:
      raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
@@ -136,26 +156,51 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
  If model is not found, it will raise a ``bentoml.exceptions.NotFound``.
  """
  config, hub_attrs, attrs = process_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
  safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get("safe_serialisation", None)), attrs.pop("safe_serialization", None), default=llm._serialisation_format == "safetensors")
  safe_serialization = openllm.utils.first_not_none(
    t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get("safe_serialisation", None)), attrs.pop("safe_serialization", None), default=llm._serialisation_format == "safetensors"
  )
  if "_quantize" in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata["_quantize"] == "gptq":
    if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
    if not openllm.utils.is_autogptq_available():
      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
    if llm.config["model_type"] != "causal_lm": raise openllm.exceptions.OpenLLMException(f"GPTQ only supports Causal LM (got {llm.__class__} of {llm.config['model_type']})")
    return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path, *decls, quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), trust_remote_code=llm.__llm_trust_remote_code__, use_safetensors=safe_serialization, **hub_attrs, **attrs)
    return autogptq.AutoGPTQForCausalLM.from_quantized(
      llm._bentomodel.path,
      *decls,
      quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config),
      trust_remote_code=llm.__llm_trust_remote_code__,
      use_safetensors=safe_serialization,
      **hub_attrs,
      **attrs
    )

  device_map = attrs.pop("device_map", "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
  model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.__llm_trust_remote_code__, device_map=device_map, **hub_attrs, **attrs).eval()
  model = infer_autoclass_from_llm(llm, config).from_pretrained(
    llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.__llm_trust_remote_code__, device_map=device_map, **hub_attrs, **attrs
  ).eval()
  # BetterTransformer is currently only supported on PyTorch.
  if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
  if llm.__llm_implementation__ in {"pt", "vllm"}: check_unintialised_params(model)
  return t.cast("M", model)
def save_pretrained(llm: openllm.LLM[M, T], save_directory: str, is_main_process: bool = True, state_dict: DictStrAny | None = None, save_function: t.Any | None = None, push_to_hub: bool = False, max_shard_size: int | str = "10GB", safe_serialization: bool = False, variant: str | None = None, **attrs: t.Any) -> None:
def save_pretrained(
  llm: openllm.LLM[M, T],
  save_directory: str,
  is_main_process: bool = True,
  state_dict: DictStrAny | None = None,
  save_function: t.Any | None = None,
  push_to_hub: bool = False,
  max_shard_size: int | str = "10GB",
  safe_serialization: bool = False,
  variant: str | None = None,
  **attrs: t.Any
) -> None:
  save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save))
  model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs)
  safe_serialization = safe_serialization or llm._serialisation_format == "safetensors"
  # NOTE: disable safetensors for vllm
  if llm.__llm_implementation__ == "vllm": safe_serialization = False
  if llm._quantize_method == "gptq":
    if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
    if not openllm.utils.is_autogptq_available():
      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
    if llm.config["model_type"] != "causal_lm": raise openllm.exceptions.OpenLLMException(f"GPTQ only supports Causal LM (got {llm.__class__} of {llm.config['model_type']})")
    if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM): raise ValueError(f"Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})")
    t.cast("autogptq.modeling.BaseGPTQForCausalLM", llm.model).save_quantized(save_directory, use_safetensors=safe_serialization)
@@ -165,5 +210,15 @@ def save_pretrained(llm: openllm.LLM[M, T], save_directory: str, is_main_process
    llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
  else:
    # We can safely cast here since it will be the PreTrainedModel protocol.
    t.cast("transformers.PreTrainedModel", llm.model).save_pretrained(save_directory, is_main_process=is_main_process, state_dict=state_dict, save_function=save_function, push_to_hub=push_to_hub, max_shard_size=max_shard_size, safe_serialization=safe_serialization, variant=variant, **model_save_attrs)
    t.cast("transformers.PreTrainedModel", llm.model).save_pretrained(
      save_directory,
      is_main_process=is_main_process,
      state_dict=state_dict,
      save_function=save_function,
      push_to_hub=push_to_hub,
      max_shard_size=max_shard_size,
      safe_serialization=safe_serialization,
      variant=variant,
      **model_save_attrs
    )
  llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)

@@ -38,7 +38,8 @@ def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T:
def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.PretrainedConfig, /) -> _BaseAutoModelClass:
  if llm.config["trust_remote_code"]:
    autoclass = "AutoModelForSeq2SeqLM" if llm.config["model_type"] == "seq2seq_lm" else "AutoModelForCausalLM"
    if not hasattr(config, "auto_map"): raise ValueError(f"Invalid configuration for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain an `auto_map` mapping")
    if not hasattr(config, "auto_map"):
      raise ValueError(f"Invalid configuration for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain an `auto_map` mapping")
    # in case this model doesn't use the correct auto class for model type, for example like chatglm
    # where it uses AutoModel instead of AutoModelForCausalLM. Then we fall back to AutoModel
    if autoclass not in config.auto_map: autoclass = "AutoModel"

@@ -5,7 +5,9 @@ if t.TYPE_CHECKING: from ._typing_compat import LiteralRuntime

logger = logging.getLogger(__name__)
@contextlib.contextmanager
def build_bento(model: str, model_id: str | None = None, quantize: t.Literal["int4", "int8", "gptq"] | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
def build_bento(
  model: str, model_id: str | None = None, quantize: t.Literal["int4", "int8", "gptq"] | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", cleanup: bool = False
) -> t.Iterator[bentoml.Bento]:
  logger.info("Building BentoML for %s", model)
  bento = openllm.build(model, model_id=model_id, quantize=quantize, runtime=runtime)
  yield bento
@@ -28,7 +30,14 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | N
    logger.info("Deleting container %s", image_tag)
    subprocess.check_output([executable, "rmi", "-f", image_tag])
@contextlib.contextmanager
def prepare(model: str, model_id: str | None = None, implementation: LiteralRuntime = "pt", deployment_mode: t.Literal["container", "local"] = "local", clean_context: contextlib.ExitStack | None = None, cleanup: bool = True) -> t.Iterator[str]:
def prepare(
  model: str,
  model_id: str | None = None,
  implementation: LiteralRuntime = "pt",
  deployment_mode: t.Literal["container", "local"] = "local",
  clean_context: contextlib.ExitStack | None = None,
  cleanup: bool = True
) -> t.Iterator[str]:
  if clean_context is None:
    clean_context = contextlib.ExitStack()
    cleanup = True

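A minimal usage sketch for the two context managers above, assuming the signatures shown here; the model name and id are placeholders, and the value yielded by prepare is treated as an opaque string per its annotation:

with build_bento("opt", model_id="facebook/opt-125m", runtime="transformers") as bento:
  print("Built bento:", bento.tag)

with prepare("opt", model_id="facebook/opt-125m", implementation="pt", deployment_mode="local") as handle:
  print("Prepared:", handle)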