mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-05-19 14:16:22 -04:00
refactor(breaking): unify LLM API (#283)
Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -47,7 +47,7 @@ _import_structure: dict[str, list[str]] = {
|
||||
"cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
|
||||
"_quantisation": ["infer_quantisation_config"],
|
||||
"_embeddings": ["GenericEmbeddingRunnable"],
|
||||
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"],
|
||||
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "EmbeddingsOutput"],
|
||||
"_generation": [
|
||||
"StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList",
|
||||
"prepare_logits_processor"
|
||||
@@ -72,7 +72,7 @@ COMPILED = _Path(__file__).suffix in (".pyd", ".so")
|
||||
if _t.TYPE_CHECKING:
|
||||
from . import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing
|
||||
from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor
|
||||
from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
|
||||
from ._llm import LLM as LLM, EmbeddingsOutput as EmbeddingsOutput, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
|
||||
from ._quantisation import infer_quantisation_config as infer_quantisation_config
|
||||
from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
|
||||
from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
|
||||
@@ -196,7 +196,12 @@ else:
|
||||
__lazy = openllm_core.utils.LazyModule(__name__,
|
||||
globals()["__file__"],
|
||||
_import_structure,
|
||||
extra_objects={"COMPILED": COMPILED})
|
||||
extra_objects={
|
||||
"COMPILED": COMPILED,
|
||||
"__openllm_migration__": {
|
||||
"LLMEmbeddings": "EmbeddingsOutput"
|
||||
}
|
||||
})
|
||||
__all__ = __lazy.__all__
|
||||
__dir__ = __lazy.__dir__
|
||||
__getattr__ = __lazy.__getattr__
|
||||
|
||||
201
openllm-python/src/openllm/_assign.py
Normal file
201
openllm-python/src/openllm/_assign.py
Normal file
@@ -0,0 +1,201 @@
|
||||
'''LLM assignment magik.'''
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import traceback
|
||||
import typing as t
|
||||
|
||||
import openllm
|
||||
|
||||
from openllm.exceptions import OpenLLMException
|
||||
from openllm_core._configuration import _object_getattribute
|
||||
from openllm_core._configuration import _setattr_class
|
||||
from openllm_core._schema import unmarshal_vllm_outputs
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core._typing_compat import ListStr
|
||||
from openllm_core._typing_compat import M
|
||||
from openllm_core._typing_compat import T
|
||||
from openllm_core._typing_compat import import_model_protocol
|
||||
from openllm_core._typing_compat import llm_post_init_protocol
|
||||
from openllm_core._typing_compat import load_model_protocol
|
||||
from openllm_core._typing_compat import load_tokenizer_protocol
|
||||
from openllm_core.utils import LazyLoader
|
||||
from openllm_core.utils import codegen
|
||||
from openllm_core.utils import device_count
|
||||
from openllm_core.utils import first_not_none
|
||||
from openllm_core.utils import is_torch_available
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import torch
|
||||
import vllm
|
||||
|
||||
import bentoml
|
||||
|
||||
from openllm._llm import LLM
|
||||
else:
|
||||
torch = LazyLoader('torch', globals(), 'torch')
|
||||
vllm = LazyLoader('vllm', globals(), 'vllm')
|
||||
|
||||
def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
    '''Decorate an ``import_model`` implementation with the LLM's stored model parameters.

    The wrapper reads ``self.llm_parameters`` and forwards to ``fn``:

    * stored positional model arguments come first, followed by the call-site ``decls``;
    * stored keyword model arguments are merged with the call-site ``attrs``, the call site winning on clashes;
    * ``trust_remote_code`` is resolved through ``first_not_none`` with ``self.trust_remote_code`` as the default.
    '''

    @functools.wraps(fn)
    def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
        resolved_trust = first_not_none(trust_remote_code, default=self.trust_remote_code)
        # llm_parameters is ((model_args, model_kwargs), tokenizer_kwargs); only the model half is used here.
        model_part, _ = self.llm_parameters
        stored_args, stored_kwargs = model_part
        merged_kwargs = {**stored_kwargs, **attrs}
        return fn(self, *stored_args, *decls, trust_remote_code=resolved_trust, **merged_kwargs)

    return inner
|
||||
|
||||
def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
    '''Decorate a ``load_model`` implementation, short-circuiting to a vLLM engine when needed.

    For every backend other than ``'vllm'`` the wrapped ``fn`` is called with the stored model
    arguments from ``self.llm_parameters`` placed before / merged under the call-site arguments.
    For the ``'vllm'`` backend ``fn`` is not called at all: a ``vllm.LLMEngine`` is built from the
    bento model path instead.

    Raises:
      OpenLLMException: if building the vLLM engine fails. The original traceback is printed
        first and the exception context is suppressed.
    '''

    @functools.wraps(fn)
    def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
        if self.__llm_backend__ != 'vllm':
            (stored_args, stored_kwargs), _ = self.llm_parameters
            return fn(self, *stored_args, *decls, **{**stored_kwargs, **attrs})

        # TODO: Do some more processing with token_id once we support token streaming
        try:
            model_path = self._bentomodel.path
            # 'local' means the tokenizer is stored alongside the model weights.
            tokenizer = self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id
            num_devices = device_count()
            engine_args = vllm.EngineArgs(model=model_path,
                                          tokenizer=tokenizer,
                                          tokenizer_mode='auto',
                                          tensor_parallel_size=1 if num_devices < 2 else num_devices,
                                          dtype='auto',
                                          worker_use_ray=False)
            return vllm.LLMEngine.from_engine_args(engine_args)
        except Exception as err:
            traceback.print_exc()
            raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from None

    return inner
|
||||
|
||||
def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]], T]:
    '''Decorate a ``load_tokenizer`` implementation with the LLM's stored tokenizer kwargs.

    The last element of ``self.llm_parameters`` supplies the defaults; keyword arguments
    given at the call site override them.
    '''

    @functools.wraps(fn)
    def inner(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
        stored_tokenizer_attrs = self.llm_parameters[-1]
        merged = {**stored_tokenizer_attrs, **tokenizer_attrs}
        return fn(self, **merged)

    return inner
|
||||
|
||||
def llm_post_init(fn: llm_post_init_protocol[M, T]) -> t.Callable[[LLM[M, T]], None]:
    '''Decorate ``llm_post_init`` so a torch device is assigned before the user hook runs.

    When the backend is ``'pt'`` and torch is available, ``self.device`` is set to CUDA if
    ``torch.cuda.is_available()`` and to CPU otherwise. The wrapped ``fn`` is then always called.
    '''

    @functools.wraps(fn)
    def inner(self: LLM[M, T]) -> None:
        needs_torch_device = self.__llm_backend__ == 'pt' and is_torch_available()
        if needs_torch_device:
            device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
            self.device = torch.device(device_name)
        fn(self)

    return inner
|
||||
|
||||
def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
    '''Generate the ``__assign_llm_attr`` function for the given LLM subclass.

    A list of source lines and a globals dict are assembled and handed to
    ``codegen.generate_function``. The lines cover, in order:

    1. wrapping ``cls.llm_post_init`` with ``llm_post_init``;
    2. wrapping ``import_model`` / ``load_model`` / ``load_tokenizer``, using the function from
       ``openllm.serialisation`` when ``cls`` still has the ``LLMSerialisation`` default;
    3. for the ``'vllm'`` backend, assigning the vLLM generate implementations;
    4. resetting the cached ``__llm_model__`` / ``__llm_tokenizer__`` / ``__llm_adapter_map__`` slots;
    5. computing every ``__llm_supports_*__`` flag annotated on ``LLMInterface``.
    '''
    from ._llm import LLM
    from ._llm import LLMFunction
    from ._llm import LLMInterface
    from ._llm import LLMSerialisation

    cls_name = cls.__name__
    globs: DictStrAny = {'cls': cls, '__wrapped_llm_post_init': llm_post_init, 'LLM': LLM}
    # Exposes _cached_LLMFunction_get and _cached_LLMSerialisation_get to the generated code.
    for base in (LLMSerialisation, LLMFunction):
        globs[f'_cached_{base.__name__}_get'] = _object_getattribute.__get__(base)

    # llm_post_init implementation
    lines: ListStr = [
        f'_impl_{cls_name}_func=cls.llm_post_init',
        _setattr_class('llm_post_init', f'__wrapped_llm_post_init(_impl_{cls_name}_func)'),
    ]

    # Serialisation hooks: the same scratch name is reassigned for each hook in the generated code.
    scratch_name = f'_cached_{cls_name}_func'
    serialisation_wrappers = (('import_model', import_model), ('load_model', load_model), ('load_tokenizer',
                                                                                          load_tokenizer))
    for func, wrapper in serialisation_wrappers:
        wrapper_name = f'__wrapped_{func}'
        resolved_name = f'_impl_{cls_name}_{func}'
        globs[f'__serialisation_{func}'] = getattr(openllm.serialisation, func, None)
        globs[wrapper_name] = wrapper
        lines.append(f'{scratch_name}=cls.{func}')
        lines.append(
            f"{resolved_name}={scratch_name} if {scratch_name} is not _cached_LLMSerialisation_get('{func}') else __serialisation_{func}"
        )
        lines.append(_setattr_class(func, f'{wrapper_name}({resolved_name})'))

    # assign vLLM implementation
    if cls.__llm_backend__ == 'vllm':
        vllm_impls = (('generate', vllm_generate), ('generate_iterator', vllm_generate_iterator),
                      ('postprocess_generate', vllm_postprocess_generate))
        for attr_name, impl in vllm_impls:
            glob_name = f'_vllm_{attr_name}'
            globs[glob_name] = impl
            lines.append(_setattr_class(attr_name, glob_name))

    interface_anns = codegen.get_annotations(LLMInterface)

    # cached attribute initialisation
    for cached_key in ('model', 'tokenizer', 'adapter_map'):
        lines.append(_setattr_class(f'__llm_{cached_key}__', None))

    # boolean for better LLM implementation resolver
    support_prefix = '__llm_supports_'
    # Strip the prefix and the trailing '__' to recover the bare feature name.
    supported_features = {ann[len(support_prefix):-2] for ann in interface_anns if ann.startswith(support_prefix)}
    for feature in supported_features:
        lines.append(
            _setattr_class(f'__llm_supports_{feature}__', f"cls.{feature} is not _cached_LLMFunction_get('{feature}')"))

    return codegen.generate_function(cls,
                                     '__assign_llm_attr',
                                     lines,
                                     args=('cls',),
                                     globs=globs,
                                     annotations={
                                         'cls': 't.Type[LLM]',
                                         'return': None
                                     })
|
||||
|
||||
def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]],
                              **_: t.Any) -> str:
    '''Return the text of the first completion of the first request in ``generation_result``.

    ``prompt`` and any extra keyword arguments are accepted for interface compatibility and ignored.
    '''
    first_request = generation_result[0]
    first_completion = first_request['outputs'][0]
    return first_completion['text']
|
||||
|
||||
def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T],
                           prompt: str,
                           /,
                           *,
                           echo: bool = False,
                           stop: str | t.Iterable[str] | None = None,
                           stop_token_ids: list[int] | None = None,
                           **attrs: t.Any) -> t.Iterator[dict[str, t.Any]]:
    '''Stream generation results for ``prompt`` from the vLLM engine.

    Args:
      prompt: The prompt to submit to the engine.
      echo: When true, each yielded text is prefixed with the request's prompt.
      stop: A single stop string or any iterable of stop strings. An empty string is ignored.
      stop_token_ids: Token ids whose decoded text is added to the stop strings. The list is
        never modified; the tokenizer's EOS id is added to an internal copy.
      **attrs: Must contain ``request_id``. May contain ``temperature`` / ``top_p`` overrides;
        everything left over is forwarded to ``self.config.model_construct_env``.

    Yields:
      ``{'text': [...], 'error_code': 0}`` for every request output returned by each engine step.

    Raises:
      ValueError: if ``request_id`` is missing or ``None`` (raised on first iteration, as this
        is a generator).
    '''
    request_id: str | None = attrs.pop('request_id', None)
    if request_id is None: raise ValueError('request_id must not be None.')

    # Work on a copy: appending EOS to the caller's list would leak state between calls.
    token_ids = [*(stop_token_ids or ()), self.tokenizer.eos_token_id]

    stop_: set[str] = set()
    if isinstance(stop, str):
        if stop != '': stop_.add(stop)
    elif stop is not None:
        # Accept any iterable (list, tuple, set, ...), as the annotation promises.
        stop_.update(stop)
    for tid in token_ids:
        # Skips a missing EOS id (None).
        # NOTE(review): this also skips token id 0 - confirm that is intended.
        if tid: stop_.add(self.tokenizer.decode(tid))

    # Per-request overrides win over the configured values. top_p is popped so it is not
    # passed twice to model_construct_env (which would raise TypeError).
    temperature = attrs.get('temperature')
    if temperature is None: temperature = self.config['temperature']
    top_p = attrs.pop('top_p', None)
    if top_p is None: top_p = self.config['top_p']
    # A near-zero temperature is treated as greedy decoding, so nucleus filtering is disabled.
    if temperature <= 1e-5: top_p = 1.0

    config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
    self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=config.to_sampling_config())
    while self.model.has_unfinished_requests():
        for request_output in self.model.step():
            if echo: text_outputs = [request_output.prompt + output.text for output in request_output.outputs]
            else: text_outputs = [output.text for output in request_output.outputs]
            yield {'text': text_outputs, 'error_code': 0}
            # Only leaves the current step's loop; the outer loop drains remaining requests.
            if request_output.finished: break
|
||||
|
||||
def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
    '''Run ``prompt`` through the vLLM engine to completion and return the finished outputs.

    ``attrs`` must contain ``request_id``; the remaining entries are forwarded to
    ``self.config.model_construct_env`` to build the sampling parameters. Every finished request
    output collected while the engine still has work is converted with ``unmarshal_vllm_outputs``.

    Raises:
      ValueError: if ``request_id`` is missing or ``None``.
    '''
    request_id: str | None = attrs.pop('request_id', None)
    if request_id is None: raise ValueError('request_id must not be None.')

    # TODO: support prompt_token_ids
    sampling_params = self.config.model_construct_env(**attrs).to_sampling_config()
    self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=sampling_params)

    finished: list[vllm.RequestOutput] = []
    while self.model.has_unfinished_requests():
        for request_output in self.model.step():
            if request_output.finished: finished.append(request_output)
    return [unmarshal_vllm_outputs(item) for item in finished]
|
||||
@@ -58,7 +58,7 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
|
||||
self.model.to(self.device)
|
||||
|
||||
@bentoml.Runnable.method(batchable=True, batch_dim=0)
|
||||
def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]:
|
||||
def encode(self, sentences: list[str]) -> t.Sequence[openllm.EmbeddingsOutput]:
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(self.device)
|
||||
@@ -69,8 +69,8 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
|
||||
# Perform pooling and normalize
|
||||
sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
|
||||
return [
|
||||
openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(),
|
||||
num_tokens=int(torch.sum(attention_mask).item()))
|
||||
openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(),
|
||||
num_tokens=int(torch.sum(attention_mask).item()))
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -78,7 +78,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
|
||||
'model_id': runner.llm.model_id,
|
||||
'timeout': 3600,
|
||||
'model_name': llm_config['model_name'],
|
||||
'framework': runner.llm_framework,
|
||||
'backend': runner.backend,
|
||||
'configuration': '',
|
||||
'supports_embeddings': runner.supports_embeddings,
|
||||
'supports_hf_agent': runner.supports_hf_agent
|
||||
@@ -86,7 +86,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
|
||||
def metadata_v1(_: str) -> openllm.MetadataOutput:
|
||||
return openllm.MetadataOutput(timeout=llm_config['timeout'],
|
||||
model_name=llm_config['model_name'],
|
||||
framework=llm_config['env']['framework_value'],
|
||||
backend=llm_config['env']['backend_value'],
|
||||
model_id=runner.llm.model_id,
|
||||
configuration=llm_config.model_dump_json().decode(),
|
||||
supports_embeddings=runner.supports_embeddings,
|
||||
|
||||
@@ -86,17 +86,17 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
|
||||
packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
|
||||
|
||||
env = llm.config['env']
|
||||
framework_envvar = env['framework_value']
|
||||
if framework_envvar == 'flax':
|
||||
backend_envvar = env['backend_value']
|
||||
if backend_envvar == 'flax':
|
||||
if not openllm_core.utils.is_flax_available():
|
||||
raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
|
||||
raise ValueError(f"Flax is not available, while {env.backend} is set to 'flax'")
|
||||
packages.extend(
|
||||
[importlib.metadata.version('flax'),
|
||||
importlib.metadata.version('jax'),
|
||||
importlib.metadata.version('jaxlib')])
|
||||
elif framework_envvar == 'tf':
|
||||
elif backend_envvar == 'tf':
|
||||
if not openllm_core.utils.is_tf_available():
|
||||
raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
|
||||
raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'")
|
||||
candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu',
|
||||
'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
|
||||
)
|
||||
@@ -125,21 +125,22 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
|
||||
return PythonOptions(packages=packages,
|
||||
wheels=wheels,
|
||||
lock_packages=False,
|
||||
extra_index_url=['https://download.pytorch.org/whl/cu118'])
|
||||
extra_index_url=[
|
||||
'https://download.pytorch.org/whl/cu118',
|
||||
'https://huggingface.github.io/autogptq-index/whl/cu118/'
|
||||
])
|
||||
|
||||
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float,
|
||||
quantize: LiteralString | None, bettertransformer: bool | None,
|
||||
adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
|
||||
runtime: t.Literal['ggml', 'transformers'], serialisation_format: t.Literal['safetensors',
|
||||
'legacy'],
|
||||
quantize: LiteralString | None, adapter_map: dict[str, str | None] | None,
|
||||
dockerfile_template: str | None, serialisation_format: t.Literal['safetensors', 'legacy'],
|
||||
container_registry: LiteralContainerRegistry,
|
||||
container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
|
||||
from openllm.cli._factory import parse_config_options
|
||||
environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
|
||||
env: openllm_core.utils.EnvVarMixin = llm.config['env']
|
||||
if env['framework_value'] == 'vllm': serialisation_format = 'legacy'
|
||||
if env['backend_value'] == 'vllm': serialisation_format = 'legacy'
|
||||
env_dict = {
|
||||
env.framework: env['framework_value'],
|
||||
env.backend: env['backend_value'],
|
||||
env.config: f"'{llm.config.model_dump_json().decode()}'",
|
||||
env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}',
|
||||
'OPENLLM_MODEL': llm.config['model_name'],
|
||||
@@ -152,14 +153,9 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
|
||||
if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1')
|
||||
|
||||
# We need to handle None separately here, as env from subprocess doesn't accept None value.
|
||||
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'],
|
||||
bettertransformer=bettertransformer,
|
||||
quantize=quantize,
|
||||
runtime=runtime)
|
||||
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize)
|
||||
|
||||
env_dict[_env.bettertransformer] = str(_env['bettertransformer_value'])
|
||||
if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
|
||||
env_dict[_env.runtime] = _env['runtime_value']
|
||||
return DockerOptions(
|
||||
base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
|
||||
env=env_dict,
|
||||
@@ -218,21 +214,19 @@ def create_bento(bento_tag: bentoml.Tag,
|
||||
llm: openllm.LLM[t.Any, t.Any],
|
||||
workers_per_resource: str | float,
|
||||
quantize: LiteralString | None,
|
||||
bettertransformer: bool | None,
|
||||
dockerfile_template: str | None,
|
||||
adapter_map: dict[str, str | None] | None = None,
|
||||
extra_dependencies: tuple[str, ...] | None = None,
|
||||
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
|
||||
serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
|
||||
container_registry: LiteralContainerRegistry = 'ecr',
|
||||
container_version_strategy: LiteralContainerVersionStrategy = 'release',
|
||||
_bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
|
||||
_model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
|
||||
framework_envvar = llm.config['env']['framework_value']
|
||||
backend_envvar = llm.config['env']['backend_value']
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({
|
||||
'_type': llm.llm_type,
|
||||
'_framework': framework_envvar,
|
||||
'_framework': backend_envvar,
|
||||
'start_name': llm.config['start_name'],
|
||||
'base_name_or_path': llm.model_id,
|
||||
'bundler': 'openllm.bundle'
|
||||
@@ -265,8 +259,8 @@ def create_bento(bento_tag: bentoml.Tag,
|
||||
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
|
||||
models=[llm_spec],
|
||||
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize,
|
||||
bettertransformer, adapter_map, dockerfile_template,
|
||||
runtime, serialisation_format, container_registry,
|
||||
adapter_map, dockerfile_template,
|
||||
serialisation_format, container_registry,
|
||||
container_version_strategy))
|
||||
|
||||
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
|
||||
|
||||
@@ -94,7 +94,7 @@ class RefResolver:
|
||||
git_hash: str = attr.field()
|
||||
version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
|
||||
strategy: LiteralContainerVersionStrategy = attr.field()
|
||||
_ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO)
|
||||
_ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
|
||||
|
||||
@classmethod
|
||||
def _nightly_ref(cls) -> RefTuple:
|
||||
|
||||
@@ -16,12 +16,15 @@ from click.shell_completion import CompletionItem
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
import openllm_core
|
||||
|
||||
from bentoml._internal.configuration.containers import BentoMLContainer
|
||||
from openllm_core._typing_compat import Concatenate
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
from openllm_core._typing_compat import LiteralString
|
||||
from openllm_core._typing_compat import ParamSpec
|
||||
from openllm_core._typing_compat import get_literal_args
|
||||
from openllm_core.utils import DEBUG
|
||||
|
||||
from . import termui
|
||||
@@ -147,14 +150,12 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
@click.pass_context
|
||||
def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
|
||||
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
|
||||
quantize: t.Literal['int8', 'int4', 'gptq'] | None, bettertransformer: bool | None,
|
||||
runtime: t.Literal['ggml', 'transformers'], fast: bool, serialisation_format: t.Literal['safetensors',
|
||||
'legacy'],
|
||||
cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
|
||||
quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend,
|
||||
serialisation_format: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None,
|
||||
return_process: bool, **attrs: t.Any,
|
||||
) -> LLMConfig | subprocess.Popen[bytes]:
|
||||
fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
|
||||
if serialisation_format == 'safetensors' and quantize is not None and os.environ.get(
|
||||
'OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
|
||||
if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env(
|
||||
'OPENLLM_SERIALIZATION_WARNING'):
|
||||
termui.echo(
|
||||
f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
|
||||
fg='yellow')
|
||||
@@ -184,20 +185,18 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
|
||||
# Create a new model env to work with the envvar during CLI invocation
|
||||
env = openllm.utils.EnvVarMixin(config['model_name'],
|
||||
config.default_implementation(),
|
||||
backend,
|
||||
model_id=model_id or config['default_id'],
|
||||
bettertransformer=bettertransformer,
|
||||
quantize=quantize,
|
||||
runtime=runtime)
|
||||
prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))
|
||||
quantize=quantize)
|
||||
requirements = llm_config['requirements']
|
||||
if requirements is not None and len(requirements) > 0:
|
||||
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
|
||||
if len(missing_requirements) > 0:
|
||||
termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
|
||||
|
||||
# NOTE: This is to set current configuration
|
||||
start_env = os.environ.copy()
|
||||
start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
|
||||
if fast:
|
||||
termui.echo(
|
||||
f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'",
|
||||
fg='yellow')
|
||||
|
||||
start_env.update({
|
||||
'OPENLLM_MODEL': model,
|
||||
@@ -205,21 +204,18 @@ Available official model_id(s): [default: {llm_config['default_id']}]
|
||||
'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
|
||||
'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
|
||||
'OPENLLM_SERIALIZATION': serialisation_format,
|
||||
env.runtime: env['runtime_value'],
|
||||
env.framework: env['framework_value']
|
||||
env.backend: env['backend_value']
|
||||
})
|
||||
if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value'])
|
||||
# NOTE: quantize and bettertransformer value is already assigned within env
|
||||
if bettertransformer is not None: start_env[env.bettertransformer] = str(env['bettertransformer_value'])
|
||||
if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))
|
||||
|
||||
llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(model,
|
||||
model_id=start_env[env.model_id],
|
||||
model_version=model_version,
|
||||
llm_config=config,
|
||||
ensure_available=not fast,
|
||||
adapter_map=adapter_map,
|
||||
serialisation=serialisation_format)
|
||||
llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model,
|
||||
model_id=start_env[env.model_id],
|
||||
model_version=model_version,
|
||||
llm_config=config,
|
||||
ensure_available=True,
|
||||
adapter_map=adapter_map,
|
||||
serialisation=serialisation_format)
|
||||
start_env.update({env.config: llm.config.model_dump_json().decode()})
|
||||
|
||||
server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer(
|
||||
@@ -268,21 +264,6 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
|
||||
|
||||
return noop
|
||||
|
||||
def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None,
                       adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
    '''Validate the environment before starting a server.

    Fails the click context when adapters are requested without ``peft``, or when quantisation is
    requested while the config's default implementation is vLLM. Missing optional requirements
    only produce a yellow warning. ``num_workers`` is accepted but not used.
    '''
    peft_missing = adapter_map and not openllm.utils.is_peft_available()
    if peft_missing:
        ctx.fail(
            "Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")

    quantize_with_vllm = quantize and llm_config.default_implementation() == 'vllm'
    if quantize_with_vllm:
        ctx.fail(
            f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization."
        )

    requirements = llm_config['requirements']
    if requirements is None or len(requirements) == 0:
        return
    # Requirement names are converted to module names before probing for an import spec.
    not_installed = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
    if not_installed:
        termui.echo(f'Make sure to have the following dependencies available: {not_installed}', fg='yellow')
|
||||
|
||||
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
|
||||
|
||||
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
|
||||
@@ -291,22 +272,21 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
|
||||
cog.optgroup.group(
|
||||
'General LLM Options',
|
||||
help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
|
||||
model_id_option(factory=cog.optgroup, model_env=llm_config['env']), model_version_option(factory=cog.optgroup),
|
||||
model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup),
|
||||
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
|
||||
workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
|
||||
fast_option(factory=cog.optgroup),
|
||||
backend_option(factory=cog.optgroup),
|
||||
cog.optgroup.group('LLM Optimization Options',
|
||||
help='''Optimization related options.
|
||||
|
||||
OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/),
|
||||
k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
|
||||
OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
|
||||
|
||||
The following are either in our roadmap or currently being worked on:
|
||||
|
||||
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
|
||||
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
|
||||
''',
|
||||
),
|
||||
), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
|
||||
cog.optgroup.option('--device',
|
||||
type=openllm.utils.dantic.CUDA,
|
||||
multiple=True,
|
||||
@@ -314,13 +294,6 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
|
||||
callback=parse_device_callback,
|
||||
help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
|
||||
show_envvar=True),
|
||||
cog.optgroup.option('--runtime',
|
||||
type=click.Choice(['ggml', 'transformers']),
|
||||
default='transformers',
|
||||
help='The runtime to use for the given model. Default is transformers.'),
|
||||
quantize_option(factory=cog.optgroup, model_env=llm_config['env']),
|
||||
bettertransformer_option(factory=cog.optgroup, model_env=llm_config['env']),
|
||||
serialisation_option(factory=cog.optgroup),
|
||||
cog.optgroup.group('Fine-tuning related options',
|
||||
help='''\
|
||||
Note that the argument `--adapter-id` can accept the following format:
|
||||
@@ -439,18 +412,6 @@ def output_option(f: _AnyCallable | None = None,
|
||||
shell_complete=complete_output_var,
|
||||
**attrs)(f)
|
||||
|
||||
def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option('--fast/--no-fast',
|
||||
show_default=True,
|
||||
default=False,
|
||||
envvar='OPENLLM_USE_LOCAL_LATEST',
|
||||
show_envvar=True,
|
||||
help='''Whether to skip checking if models is already in store.
|
||||
|
||||
This is useful if you already downloaded or setup the model beforehand.
|
||||
''',
|
||||
**attrs)(f)
|
||||
|
||||
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option('--cors/--no-cors',
|
||||
show_default=True,
|
||||
@@ -463,15 +424,12 @@ def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
|
||||
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
    '''Attach the hidden ``--machine`` boolean flag (default ``False``) to ``f``.'''
    decorator = cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)
    return decorator(f)
|
||||
|
||||
def model_id_option(f: _AnyCallable | None = None,
|
||||
*,
|
||||
model_env: openllm.utils.EnvVarMixin | None = None,
|
||||
**attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option('--model-id',
|
||||
type=click.STRING,
|
||||
default=None,
|
||||
envvar=model_env.model_id if model_env is not None else None,
|
||||
show_envvar=model_env is not None,
|
||||
envvar='OPENLLM_MODEL_ID',
|
||||
show_envvar=True,
|
||||
help='Optional model_id name or path for (fine-tune) weight.',
|
||||
**attrs)(f)
|
||||
|
||||
@@ -483,24 +441,31 @@ def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
|
||||
help='Optional model version to save for this model. It will be inferred automatically from model-id.',
|
||||
**attrs)(f)
|
||||
|
||||
def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
    '''Attach the ``--backend`` choice option to ``f``.

    The option defaults to ``'pt'`` and can also be set through ``OPENLLM_BACKEND``.
    '''
    # NOTE: the last two entries of LiteralBackend (ggml and mlc) are work in progress,
    # so they are excluded from the selectable choices.
    # XXX: drop the slice once ggml and mlc are supported.
    selectable_backends = get_literal_args(LiteralBackend)[:-2]
    decorator = cli_option('--backend',
                           type=click.Choice(selectable_backends),
                           default='pt',
                           envvar='OPENLLM_BACKEND',
                           show_envvar=True,
                           help='The implementation for saving this LLM.',
                           **attrs)
    return decorator(f)
|
||||
|
||||
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_argument('model_name',
|
||||
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
|
||||
required=required,
|
||||
**attrs)(f)
|
||||
|
||||
def quantize_option(f: _AnyCallable | None = None,
|
||||
*,
|
||||
build: bool = False,
|
||||
model_env: openllm.utils.EnvVarMixin | None = None,
|
||||
**attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option('--quantise',
|
||||
'--quantize',
|
||||
'quantize',
|
||||
type=click.Choice(['int8', 'int4', 'gptq']),
|
||||
default=None,
|
||||
envvar=model_env.quantize if model_env is not None else None,
|
||||
show_envvar=model_env is not None,
|
||||
envvar='OPENLLM_QUANTIZE',
|
||||
show_envvar=True,
|
||||
help='''Dynamic quantization for running this LLM.
|
||||
|
||||
The following quantization strategies are supported:
|
||||
@@ -542,21 +507,6 @@ def workers_per_resource_option(f: _AnyCallable | None = None,
|
||||
> ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''),
|
||||
**attrs)(f)
|
||||
|
||||
def bettertransformer_option(f: _AnyCallable | None = None,
|
||||
*,
|
||||
build: bool = False,
|
||||
model_env: openllm.utils.EnvVarMixin | None = None,
|
||||
**attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option(
|
||||
'--bettertransformer',
|
||||
is_flag=True,
|
||||
default=None,
|
||||
envvar=model_env.bettertransformer if model_env is not None else None,
|
||||
show_envvar=model_env is not None,
|
||||
help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.' if not build else
|
||||
'Set default environment variable whether to serve this model with FasterTransformer in build time.',
|
||||
**attrs)(f)
|
||||
|
||||
def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option('--serialisation',
|
||||
'--serialization',
|
||||
@@ -586,22 +536,18 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
|
||||
**attrs)(f)
|
||||
|
||||
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
|
||||
return cli_option('--container-registry',
|
||||
'container_registry',
|
||||
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
|
||||
default='ecr',
|
||||
show_default=True,
|
||||
show_envvar=True,
|
||||
envvar='OPENLLM_CONTAINER_REGISTRY',
|
||||
callback=container_registry_callback,
|
||||
help='''The default container registry to get the base image for building BentoLLM.
|
||||
|
||||
Currently, it supports 'ecr', 'ghcr.io', 'docker.io'
|
||||
|
||||
\b
|
||||
> [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
|
||||
''',
|
||||
**attrs)(f)
|
||||
return cli_option(
|
||||
'--container-registry',
|
||||
'container_registry',
|
||||
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
|
||||
default='ecr',
|
||||
show_default=True,
|
||||
show_envvar=True,
|
||||
envvar='OPENLLM_CONTAINER_REGISTRY',
|
||||
callback=container_registry_callback,
|
||||
help=
|
||||
'The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
|
||||
**attrs)(f)
|
||||
|
||||
_wpr_strategies = {'round_robin', 'conserved'}
|
||||
|
||||
|
||||
@@ -23,9 +23,9 @@ from ._factory import start_command_factory
|
||||
if t.TYPE_CHECKING:
|
||||
from bentoml._internal.bento import BentoStore
|
||||
from openllm_core._configuration import LLMConfig
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
from openllm_core._typing_compat import LiteralContainerRegistry
|
||||
from openllm_core._typing_compat import LiteralContainerVersionStrategy
|
||||
from openllm_core._typing_compat import LiteralRuntime
|
||||
from openllm_core._typing_compat import LiteralString
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -38,10 +38,8 @@ def _start(model_name: str,
|
||||
workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
|
||||
device: tuple[str, ...] | t.Literal['all'] | None = None,
|
||||
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
|
||||
bettertransformer: bool | None = None,
|
||||
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
|
||||
adapter_map: dict[LiteralString, str | None] | None = None,
|
||||
framework: LiteralRuntime | None = None,
|
||||
backend: LiteralBackend | None = None,
|
||||
additional_args: list[str] | None = None,
|
||||
cors: bool = False,
|
||||
_serve_grpc: bool = False,
|
||||
@@ -57,48 +55,42 @@ def _start(model_name: str,
|
||||
|
||||
``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.
|
||||
|
||||
> [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
|
||||
|
||||
Args:
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
timeout: The server timeout
|
||||
workers_per_resource: Number of workers per resource assigned.
|
||||
See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
|
||||
for more information. By default, this is set to 1.
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
timeout: The server timeout
|
||||
workers_per_resource: Number of workers per resource assigned.
|
||||
See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
|
||||
for more information. By default, this is set to 1.
|
||||
|
||||
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
|
||||
> - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
|
||||
> - ``conserved``: This will determine the number of available GPU resources, and only assign
|
||||
> one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
|
||||
> equivalent to ``--workers-per-resource 0.25``.
|
||||
device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
|
||||
argument to assign all available GPUs to this LLM.
|
||||
quantize: Quantize the model weights. This is only applicable for PyTorch models.
|
||||
Possible quantisation strategies:
|
||||
- int8: Quantize the model with 8bit (bitsandbytes required)
|
||||
- int4: Quantize the model with 4bit (bitsandbytes required)
|
||||
- gptq: Quantize the model with GPTQ (auto-gptq required)
|
||||
bettertransformer: Convert given model to FastTransformer with PyTorch.
|
||||
runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
|
||||
cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
|
||||
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
|
||||
framework: The framework to use for this LLM. By default, this is set to ``pt``.
|
||||
additional_args: Additional arguments to pass to ``openllm start``.
|
||||
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
|
||||
> - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
|
||||
> - ``conserved``: This will determine the number of available GPU resources, and only assign
|
||||
> one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
|
||||
> equivalent to ``--workers-per-resource 0.25``.
|
||||
device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
|
||||
argument to assign all available GPUs to this LLM.
|
||||
quantize: Quantize the model weights. This is only applicable for PyTorch models.
|
||||
Possible quantisation strategies:
|
||||
- int8: Quantize the model with 8bit (bitsandbytes required)
|
||||
- int4: Quantize the model with 4bit (bitsandbytes required)
|
||||
- gptq: Quantize the model with GPTQ (auto-gptq required)
|
||||
cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
|
||||
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
|
||||
backend: The backend to use for this LLM. By default, this is set to ``pt``.
|
||||
additional_args: Additional arguments to pass to ``openllm start``.
|
||||
"""
|
||||
from .entrypoint import start_command
|
||||
from .entrypoint import start_grpc_command
|
||||
llm_config = openllm.AutoConfig.for_model(model_name)
|
||||
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
|
||||
openllm_core.utils.first_not_none(
|
||||
framework, default=llm_config.default_implementation()),
|
||||
backend=openllm_core.utils.first_not_none(
|
||||
backend, default=llm_config.default_backend()),
|
||||
model_id=model_id,
|
||||
bettertransformer=bettertransformer,
|
||||
quantize=quantize,
|
||||
runtime=runtime)
|
||||
os.environ[_ModelEnv.framework] = _ModelEnv['framework_value']
|
||||
quantize=quantize)
|
||||
os.environ[_ModelEnv.backend] = _ModelEnv['backend_value']
|
||||
|
||||
args: list[str] = ['--runtime', runtime]
|
||||
args: list[str] = []
|
||||
if model_id: args.extend(['--model-id', model_id])
|
||||
if timeout: args.extend(['--server-timeout', str(timeout)])
|
||||
if workers_per_resource:
|
||||
@@ -107,10 +99,7 @@ def _start(model_name: str,
|
||||
str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource
|
||||
])
|
||||
if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
|
||||
if quantize and bettertransformer:
|
||||
raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
|
||||
if quantize: args.extend(['--quantize', str(quantize)])
|
||||
elif bettertransformer: args.append('--bettertransformer')
|
||||
if cors: args.append('--cors')
|
||||
if adapter_map:
|
||||
args.extend(
|
||||
@@ -134,12 +123,10 @@ def _build(model_name: str,
|
||||
model_version: str | None = None,
|
||||
bento_version: str | None = None,
|
||||
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
|
||||
bettertransformer: bool | None = None,
|
||||
adapter_map: dict[str, str | None] | None = None,
|
||||
build_ctx: str | None = None,
|
||||
enable_features: tuple[str, ...] | None = None,
|
||||
workers_per_resource: float | None = None,
|
||||
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
|
||||
dockerfile_template: str | None = None,
|
||||
overwrite: bool = False,
|
||||
container_registry: LiteralContainerRegistry | None = None,
|
||||
@@ -153,59 +140,50 @@ def _build(model_name: str,
|
||||
|
||||
The LLM will be built into a BentoService with the following structure:
|
||||
if ``quantize`` is passed, it will instruct the model to be quantized dynamically during serving time.
|
||||
if ``bettertransformer`` is passed, it will instruct the model to apply FasterTransformer during serving time.
|
||||
|
||||
``openllm.build`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as ``openllm build`` CLI.
|
||||
|
||||
> [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
|
||||
|
||||
Args:
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
model_version: Optional model version for this given LLM
|
||||
bento_version: Optional bento version for this given BentoLLM
|
||||
quantize: Quantize the model weights. This is only applicable for PyTorch models.
|
||||
Possible quantisation strategies:
|
||||
- int8: Quantize the model with 8bit (bitsandbytes required)
|
||||
- int4: Quantize the model with 4bit (bitsandbytes required)
|
||||
- gptq: Quantize the model with GPTQ (auto-gptq required)
|
||||
bettertransformer: Convert given model to FastTransformer with PyTorch.
|
||||
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
|
||||
build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
|
||||
enable_features: Additional OpenLLM features to be included with this BentoLLM.
|
||||
workers_per_resource: Number of workers per resource assigned.
|
||||
See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
|
||||
for more information. By default, this is set to 1.
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
model_version: Optional model version for this given LLM
|
||||
bento_version: Optional bento version for this given BentoLLM
|
||||
quantize: Quantize the model weights. This is only applicable for PyTorch models.
|
||||
Possible quantisation strategies:
|
||||
- int8: Quantize the model with 8bit (bitsandbytes required)
|
||||
- int4: Quantize the model with 4bit (bitsandbytes required)
|
||||
- gptq: Quantize the model with GPTQ (auto-gptq required)
|
||||
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
|
||||
build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
|
||||
enable_features: Additional OpenLLM features to be included with this BentoLLM.
|
||||
workers_per_resource: Number of workers per resource assigned.
|
||||
See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
|
||||
for more information. By default, this is set to 1.
|
||||
|
||||
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
|
||||
> - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
|
||||
> - ``conserved``: This will determine the number of available GPU resources, and only assign
|
||||
> one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
|
||||
> equivalent to ``--workers-per-resource 0.25``.
|
||||
runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
|
||||
dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
|
||||
overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
|
||||
push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
|
||||
containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
|
||||
Note that 'containerize' and 'push' are mutually exclusive
|
||||
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
|
||||
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
|
||||
container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
|
||||
serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
|
||||
additional_args: Additional arguments to pass to ``openllm build``.
|
||||
bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
|
||||
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
|
||||
> - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
|
||||
> - ``conserved``: This will determine the number of available GPU resources, and only assign
|
||||
> one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
|
||||
> equivalent to ``--workers-per-resource 0.25``.
|
||||
dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
|
||||
overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
|
||||
push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
|
||||
containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
|
||||
Note that 'containerize' and 'push' are mutually exclusive
|
||||
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
|
||||
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
|
||||
container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
|
||||
serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
|
||||
additional_args: Additional arguments to pass to ``openllm build``.
|
||||
bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
|
||||
|
||||
Returns:
|
||||
``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
|
||||
"""
|
||||
args: list[str] = [
|
||||
sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation',
|
||||
serialisation_format
|
||||
sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format
|
||||
]
|
||||
if quantize and bettertransformer:
|
||||
raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
|
||||
if quantize: args.extend(['--quantize', quantize])
|
||||
if bettertransformer: args.append('--bettertransformer')
|
||||
if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
|
||||
if push: args.extend(['--push'])
|
||||
if containerize: args.extend(['--containerize'])
|
||||
@@ -241,8 +219,7 @@ def _import_model(model_name: str,
|
||||
*,
|
||||
model_id: str | None = None,
|
||||
model_version: str | None = None,
|
||||
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
|
||||
implementation: LiteralRuntime = 'pt',
|
||||
backend: LiteralBackend = 'pt',
|
||||
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
|
||||
serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
|
||||
additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
|
||||
@@ -259,28 +236,24 @@ def _import_model(model_name: str,
|
||||
> ``openllm.start`` will automatically invoke ``openllm.download`` under the hood.
|
||||
|
||||
Args:
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
model_version: Optional model version for this given LLM
|
||||
runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
|
||||
implementation: The implementation to use for this LLM. By default, this is set to ``pt``.
|
||||
quantize: Quantize the model weights. This is only applicable for PyTorch models.
|
||||
Possible quantisation strategies:
|
||||
- int8: Quantize the model with 8bit (bitsandbytes required)
|
||||
- int4: Quantize the model with 4bit (bitsandbytes required)
|
||||
- gptq: Quantize the model with GPTQ (auto-gptq required)
|
||||
serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
|
||||
Default behaviour is similar to ``safe_serialization=False``.
|
||||
additional_args: Additional arguments to pass to ``openllm import``.
|
||||
model_name: The model name to start this LLM
|
||||
model_id: Optional model id for this given LLM
|
||||
model_version: Optional model version for this given LLM
|
||||
backend: The backend to use for this LLM. By default, this is set to ``pt``.
|
||||
quantize: Quantize the model weights. This is only applicable for PyTorch models.
|
||||
Possible quantisation strategies:
|
||||
- int8: Quantize the model with 8bit (bitsandbytes required)
|
||||
- int4: Quantize the model with 4bit (bitsandbytes required)
|
||||
- gptq: Quantize the model with GPTQ (auto-gptq required)
|
||||
serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
|
||||
Default behaviour is similar to ``safe_serialization=False``.
|
||||
additional_args: Additional arguments to pass to ``openllm import``.
|
||||
|
||||
Returns:
|
||||
``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
|
||||
``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
|
||||
"""
|
||||
from .entrypoint import import_command
|
||||
args = [
|
||||
model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation',
|
||||
serialisation_format,
|
||||
]
|
||||
args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format]
|
||||
if model_id is not None: args.append(model_id)
|
||||
if model_version is not None: args.extend(['--model-version', str(model_version)])
|
||||
if additional_args is not None: args.extend(additional_args)
|
||||
|
||||
@@ -66,7 +66,7 @@ from openllm.models.auto import AutoLLM
|
||||
from openllm.utils import infer_auto_class
|
||||
from openllm_core._typing_compat import Concatenate
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core._typing_compat import LiteralRuntime
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
from openllm_core._typing_compat import LiteralString
|
||||
from openllm_core._typing_compat import ParamSpec
|
||||
from openllm_core._typing_compat import Self
|
||||
@@ -80,7 +80,6 @@ from openllm_core.utils import analytics
|
||||
from openllm_core.utils import bentoml_cattr
|
||||
from openllm_core.utils import compose
|
||||
from openllm_core.utils import configure_logging
|
||||
from openllm_core.utils import dantic
|
||||
from openllm_core.utils import first_not_none
|
||||
from openllm_core.utils import get_debug_mode
|
||||
from openllm_core.utils import get_quiet_mode
|
||||
@@ -94,15 +93,13 @@ from . import termui
|
||||
from ._factory import FC
|
||||
from ._factory import LiteralOutput
|
||||
from ._factory import _AnyCallable
|
||||
from ._factory import bettertransformer_option
|
||||
from ._factory import backend_option
|
||||
from ._factory import container_registry_option
|
||||
from ._factory import fast_option
|
||||
from ._factory import machine_option
|
||||
from ._factory import model_id_option
|
||||
from ._factory import model_name_argument
|
||||
from ._factory import model_version_option
|
||||
from ._factory import output_option
|
||||
from ._factory import parse_device_callback
|
||||
from ._factory import quantize_option
|
||||
from ._factory import serialisation_option
|
||||
from ._factory import start_command_factory
|
||||
@@ -205,21 +202,6 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
|
||||
|
||||
return t.cast(t.Callable[Concatenate[bool, P], t.Any], wrapper)
|
||||
|
||||
@staticmethod
|
||||
def exception_handling(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[P, t.Any]:
|
||||
command_name = attrs.get('name', func.__name__)
|
||||
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any:
|
||||
try:
|
||||
return func(*args, **attrs)
|
||||
except OpenLLMException as err:
|
||||
raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg='red')) from err
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
return wrapper
|
||||
|
||||
def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
|
||||
if cmd_name in t.cast('Extensions', extension_command).list_commands(ctx):
|
||||
return t.cast('Extensions', extension_command).get_command(ctx, cmd_name)
|
||||
@@ -253,11 +235,11 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
|
||||
name = name.replace('_', '-')
|
||||
kwargs.setdefault('help', inspect.getdoc(f))
|
||||
kwargs.setdefault('name', name)
|
||||
wrapped = self.exception_handling(self.usage_tracking(self.common_params(f), self, **kwargs), self, **kwargs)
|
||||
wrapped = self.usage_tracking(self.common_params(f), self, **kwargs)
|
||||
|
||||
# move common parameters to end of the parameters list
|
||||
_memo = getattr(wrapped, '__click_params__', None)
|
||||
if _memo is None: raise RuntimeError('Click command not register correctly.')
|
||||
if _memo is None: raise ValueError('Click command not register correctly.')
|
||||
_object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS:] + _memo[:-self.NUMBER_OF_COMMON_PARAMS])
|
||||
# NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup
|
||||
cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped)
|
||||
@@ -348,11 +330,10 @@ _start_mapping = {
|
||||
@click.argument('model_id', type=click.STRING, default=None, metavar='Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=False)
|
||||
@click.argument('converter', envvar='CONVERTER', type=click.STRING, default=None, required=False, metavar=None)
|
||||
@model_version_option
|
||||
@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
|
||||
@output_option
|
||||
@quantize_option
|
||||
@machine_option
|
||||
@click.option('--implementation', type=click.Choice(['pt', 'tf', 'flax', 'vllm']), default=None, help='The implementation for saving this LLM.')
|
||||
@backend_option
|
||||
@serialisation_option
|
||||
def import_command(
|
||||
model_name: str,
|
||||
@@ -360,9 +341,8 @@ def import_command(
|
||||
converter: str | None,
|
||||
model_version: str | None,
|
||||
output: LiteralOutput,
|
||||
runtime: t.Literal['ggml', 'transformers'],
|
||||
machine: bool,
|
||||
implementation: LiteralRuntime | None,
|
||||
backend: LiteralBackend,
|
||||
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
|
||||
serialisation_format: t.Literal['safetensors', 'legacy'],
|
||||
) -> bentoml.Model:
|
||||
@@ -415,45 +395,42 @@ def import_command(
|
||||
```bash
|
||||
$ CONVERTER=llama2-hf openllm import llama /path/to/llama-2
|
||||
```
|
||||
|
||||
> [!WARNING] This behaviour will override ``--runtime``. Therefore make sure that the LLM contains correct conversion strategies to both GGML and HF.
|
||||
"""
|
||||
llm_config = AutoConfig.for_model(model_name)
|
||||
env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
|
||||
impl: LiteralRuntime = first_not_none(implementation, default=env['framework_value'])
|
||||
llm = infer_auto_class(impl).for_model(
|
||||
env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize)
|
||||
backend = first_not_none(backend, default=env['backend_value'])
|
||||
llm = infer_auto_class(backend).for_model(
|
||||
model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
|
||||
)
|
||||
_previously_saved = False
|
||||
try:
|
||||
_ref = serialisation.get(llm)
|
||||
_previously_saved = True
|
||||
except bentoml.exceptions.NotFound:
|
||||
except openllm.exceptions.OpenLLMException:
|
||||
if not machine and output == 'pretty':
|
||||
msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for implementation {llm.__llm_implementation__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
|
||||
msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
|
||||
termui.echo(msg, fg='yellow', nl=True)
|
||||
_ref = serialisation.get(llm, auto_import=True)
|
||||
if impl == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
|
||||
if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
|
||||
if machine: return _ref
|
||||
elif output == 'pretty':
|
||||
if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for framework '{impl}': {_ref.tag!s}", nl=True, fg='yellow')
|
||||
if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for backend '{backend}': {_ref.tag!s}", nl=True, fg='yellow')
|
||||
else: termui.echo(f'Saved model: {_ref.tag}')
|
||||
elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'framework': impl, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
|
||||
elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'backend': backend, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
|
||||
else: termui.echo(_ref.tag)
|
||||
return _ref
|
||||
|
||||
@cli.command(context_settings={'token_normalize_func': inflection.underscore})
|
||||
@model_name_argument
|
||||
@model_id_option
|
||||
@output_option
|
||||
@machine_option
|
||||
@backend_option
|
||||
@click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the the model revision.')
|
||||
@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
|
||||
@workers_per_resource_option(factory=click, build=True)
|
||||
@click.option('--device', type=dantic.CUDA, multiple=True, envvar='CUDA_VISIBLE_DEVICES', callback=parse_device_callback, help='Set the device', show_envvar=True)
|
||||
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options')
|
||||
@quantize_option(factory=cog.optgroup, build=True)
|
||||
@bettertransformer_option(factory=cog.optgroup)
|
||||
@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
|
||||
@click.option(
|
||||
'--enable-features',
|
||||
multiple=True,
|
||||
@@ -476,7 +453,6 @@ def import_command(
|
||||
@click.option(
|
||||
'--container-version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='release', help="Default container version strategy for the image from '--container-registry'"
|
||||
)
|
||||
@fast_option
|
||||
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')
|
||||
@cog.optgroup.option(
|
||||
'--containerize',
|
||||
@@ -496,21 +472,18 @@ def build_command(
|
||||
bento_version: str | None,
|
||||
overwrite: bool,
|
||||
output: LiteralOutput,
|
||||
runtime: t.Literal['ggml', 'transformers'],
|
||||
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
|
||||
enable_features: tuple[str, ...] | None,
|
||||
bettertransformer: bool | None,
|
||||
workers_per_resource: float | None,
|
||||
adapter_id: tuple[str, ...],
|
||||
build_ctx: str | None,
|
||||
backend: LiteralBackend,
|
||||
machine: bool,
|
||||
device: tuple[str, ...],
|
||||
model_version: str | None,
|
||||
dockerfile_template: t.TextIO | None,
|
||||
containerize: bool,
|
||||
push: bool,
|
||||
serialisation_format: t.Literal['safetensors', 'legacy'],
|
||||
fast: bool,
|
||||
container_registry: LiteralContainerRegistry,
|
||||
container_version_strategy: LiteralContainerVersionStrategy,
|
||||
force_push: bool,
|
||||
@@ -539,22 +512,21 @@ def build_command(
|
||||
_previously_built = False
|
||||
|
||||
llm_config = AutoConfig.for_model(model_name)
|
||||
env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, quantize=quantize, bettertransformer=bettertransformer, runtime=runtime)
|
||||
env = EnvVarMixin(model_name, backend=backend, model_id=model_id, quantize=quantize)
|
||||
|
||||
# NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
|
||||
# during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
|
||||
try:
|
||||
os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), env.runtime: str(env['runtime_value']), 'OPENLLM_SERIALIZATION': serialisation_format})
|
||||
os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']})
|
||||
if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value'])
|
||||
if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value'])
|
||||
os.environ[env.bettertransformer] = str(env['bettertransformer_value'])
|
||||
|
||||
llm = infer_auto_class(env['framework_value']).for_model(
|
||||
model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs
|
||||
llm = infer_auto_class(env['backend_value']).for_model(
|
||||
model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs
|
||||
)
|
||||
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update({'_type': llm.llm_type, '_framework': env['framework_value']})
|
||||
labels.update({'_type': llm.llm_type, '_framework': env['backend_value']})
|
||||
workers_per_resource = first_not_none(workers_per_resource, default=llm_config['workers_per_resource'])
|
||||
|
||||
with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
|
||||
@@ -603,10 +575,8 @@ def build_command(
|
||||
workers_per_resource=workers_per_resource,
|
||||
adapter_map=adapter_map,
|
||||
quantize=quantize,
|
||||
bettertransformer=bettertransformer,
|
||||
extra_dependencies=enable_features,
|
||||
dockerfile_template=dockerfile_template_path,
|
||||
runtime=runtime,
|
||||
container_registry=container_registry,
|
||||
container_version_strategy=container_version_strategy
|
||||
)
|
||||
@@ -632,16 +602,17 @@ def build_command(
|
||||
|
||||
if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
|
||||
elif containerize:
|
||||
backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
|
||||
container_backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
|
||||
try:
|
||||
bentoml.container.health(backend)
|
||||
bentoml.container.health(container_backend)
|
||||
except subprocess.CalledProcessError:
|
||||
raise OpenLLMException(f'Failed to use backend {backend}') from None
|
||||
try:
|
||||
bentoml.container.build(bento.tag, backend=backend, features=('grpc', 'io'))
|
||||
bentoml.container.build(bento.tag, backend=container_backend, features=('grpc', 'io'))
|
||||
except Exception as err:
|
||||
raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err
|
||||
return bento
|
||||
|
||||
@cli.command()
|
||||
@output_option
|
||||
@click.option('--show-available', is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').")
|
||||
@@ -667,21 +638,21 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
else:
|
||||
failed_initialized: list[tuple[str, Exception]] = []
|
||||
|
||||
json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'runtime_impl'], t.Any] | t.Any] = {}
|
||||
json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'backend'], t.Any] | t.Any] = {}
|
||||
converted: list[str] = []
|
||||
for m in models:
|
||||
config = AutoConfig.for_model(m)
|
||||
runtime_impl: tuple[str, ...] = ()
|
||||
if config['model_name'] in MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
|
||||
if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
|
||||
if config['model_name'] in MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
|
||||
if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ('vllm',)
|
||||
backend: tuple[str, ...] = ()
|
||||
if config['model_name'] in MODEL_MAPPING_NAMES: backend += ('pt',)
|
||||
if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: backend += ('flax',)
|
||||
if config['model_name'] in MODEL_TF_MAPPING_NAMES: backend += ('tf',)
|
||||
if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: backend += ('vllm',)
|
||||
json_data[m] = {
|
||||
'architecture': config['architecture'],
|
||||
'model_id': config['model_ids'],
|
||||
'cpu': not config['requires_gpu'],
|
||||
'gpu': True,
|
||||
'runtime_impl': runtime_impl,
|
||||
'backend': backend,
|
||||
'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm',
|
||||
}
|
||||
converted.extend([normalise_model_name(i) for i in config['model_ids']])
|
||||
@@ -708,10 +679,10 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
|
||||
import tabulate
|
||||
|
||||
tabulate.PRESERVE_WHITESPACE = True
|
||||
# llm, architecture, url, model_id, installation, cpu, gpu, runtime_impl
|
||||
data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralRuntime, ...]]] = []
|
||||
# llm, architecture, url, model_id, installation, cpu, gpu, backend
|
||||
data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralBackend, ...]]] = []
|
||||
for m, v in json_data.items():
|
||||
data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['runtime_impl'],)])
|
||||
data.extend([(m, v['architecture'], v['model_id'], v['installation'], '❌' if not v['cpu'] else '✅', '✅', v['backend'],)])
|
||||
column_widths = [
|
||||
int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),
|
||||
]
|
||||
|
||||
@@ -18,7 +18,7 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
|
||||
prompt,
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
|
||||
|
||||
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
|
||||
def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
embeddings: list[list[float]] = []
|
||||
@@ -30,4 +30,4 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
|
||||
data = F.normalize(torch.mean(outputs.hidden_states[-1].transpose(0, 1), dim=0), p=2, dim=0)
|
||||
embeddings.append(data.tolist())
|
||||
num_tokens += len(input_ids[0])
|
||||
return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
|
||||
return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens)
|
||||
|
||||
@@ -17,7 +17,7 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
|
||||
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
|
||||
skip_special_tokens=True)
|
||||
|
||||
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
|
||||
def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
embeddings: list[list[float]] = []
|
||||
@@ -29,4 +29,4 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
|
||||
data = F.normalize(torch.mean(outputs.encoder_last_hidden_state[0], dim=0), p=2, dim=0)
|
||||
embeddings.append(data.tolist())
|
||||
num_tokens += len(input_ids[0])
|
||||
return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
|
||||
return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens)
|
||||
|
||||
@@ -13,7 +13,7 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
|
||||
import torch
|
||||
return {'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}
|
||||
|
||||
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
|
||||
def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
encoding = self.tokenizer(prompts, padding=True, return_tensors='pt').to(self.device)
|
||||
@@ -23,8 +23,8 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
|
||||
mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
|
||||
masked_embeddings = data * mask
|
||||
sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
|
||||
return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
|
||||
num_tokens=int(torch.sum(attention_mask).item()))
|
||||
return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
|
||||
num_tokens=int(torch.sum(attention_mask).item()))
|
||||
|
||||
def generate_one(self, prompt: str, stop: list[str],
|
||||
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
|
||||
|
||||
@@ -33,10 +33,6 @@ def get_mpt_config(model_id_or_path: str,
|
||||
class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXTokenizerFast']):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self) -> None:
|
||||
import torch
|
||||
self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
||||
|
||||
@property
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
|
||||
import torch
|
||||
@@ -49,7 +45,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
|
||||
import torch
|
||||
import transformers
|
||||
_, tokenizer_attrs = self.llm_parameters
|
||||
torch_dtype = attrs.pop('torch_dtype', self.dtype)
|
||||
torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
|
||||
device_map = attrs.pop('device_map', None)
|
||||
attrs.pop('low_cpu_mem_usage', None)
|
||||
config = get_mpt_config(self.model_id,
|
||||
@@ -75,7 +71,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
|
||||
|
||||
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
|
||||
import transformers
|
||||
torch_dtype = attrs.pop('torch_dtype', self.dtype)
|
||||
torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
|
||||
device_map = attrs.pop('device_map', None)
|
||||
trust_remote_code = attrs.pop('trust_remote_code', True)
|
||||
config = get_mpt_config(self._bentomodel.path,
|
||||
|
||||
@@ -8,10 +8,6 @@ if t.TYPE_CHECKING:
|
||||
class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
|
||||
__openllm_internal__ = True
|
||||
|
||||
def llm_post_init(self) -> None:
|
||||
import torch
|
||||
self.bettertransformer = True if not torch.cuda.is_available() else False
|
||||
|
||||
@property
|
||||
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
|
||||
import torch
|
||||
|
||||
@@ -1,27 +1,9 @@
|
||||
"""Serialisation utilities for OpenLLM.
|
||||
'''Serialisation utilities for OpenLLM.
|
||||
|
||||
Currently supports transformers for PyTorch, Tensorflow and Flax.
|
||||
|
||||
Currently, support for the GGML format is a work in progress.
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
import openllm
|
||||
|
||||
llm = openllm.AutoLLM.for_model("dolly-v2")
|
||||
llm.save_pretrained("./path/to/local-dolly")
|
||||
```
|
||||
|
||||
To use different runtime, specify directly in the `for_model` method:
|
||||
|
||||
```python
|
||||
import openllm
|
||||
|
||||
llm = openllm.AutoLLM.for_model("dolly-v2", runtime='ggml')
|
||||
llm.save_pretrained("./path/to/local-dolly")
|
||||
```
|
||||
"""
|
||||
'''
|
||||
from __future__ import annotations
|
||||
import importlib
|
||||
import typing as t
|
||||
@@ -54,7 +36,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
|
||||
from .transformers._helpers import infer_tokenizers_from_llm
|
||||
from .transformers._helpers import process_config
|
||||
|
||||
config, *_ = process_config(llm._bentomodel.path, llm.__llm_trust_remote_code__)
|
||||
config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code)
|
||||
bentomodel_fs = fs.open_fs(llm._bentomodel.path)
|
||||
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
|
||||
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile:
|
||||
@@ -62,12 +44,11 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
|
||||
tokenizer = cloudpickle.load(t.cast('t.IO[bytes]', cofile))['tokenizer']
|
||||
except KeyError:
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
"Bento model does not have tokenizer. Make sure to save"
|
||||
" the tokenizer within the model via 'custom_objects'."
|
||||
" For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
|
||||
"Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
|
||||
"For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
|
||||
else:
|
||||
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'),
|
||||
trust_remote_code=llm.__llm_trust_remote_code__,
|
||||
trust_remote_code=llm.trust_remote_code,
|
||||
**tokenizer_attrs)
|
||||
|
||||
if tokenizer.pad_token_id is None:
|
||||
@@ -82,18 +63,20 @@ class _Caller(t.Protocol[P]):
|
||||
def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
|
||||
...
|
||||
|
||||
_extras = ['get', 'import_model', 'save_pretrained', 'load_model']
|
||||
_extras = ['get', 'import_model', 'load_model']
|
||||
|
||||
def _make_dispatch_function(fn: str) -> _Caller[P]:
|
||||
|
||||
def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
|
||||
"""Generic function dispatch to the correct serialisation submodule based on the LLM backend.
|
||||
|
||||
> [!NOTE] See 'openllm.serialisation.transformers' if 'llm.runtime="transformers"'
|
||||
> [!NOTE] See 'openllm.serialisation.transformers' if 'llm.__llm_backend__ in ("pt", "tf", "flax", "vllm")'
|
||||
|
||||
> [!NOTE] See 'openllm.serialisation.ggml' if 'llm.runtime="ggml"'
|
||||
> [!NOTE] See 'openllm.serialisation.ggml' if 'llm.__llm_backend__="ggml"'
|
||||
"""
|
||||
return getattr(importlib.import_module(f'.{llm.runtime}', __name__), fn)(llm, *args, **kwargs)
|
||||
serde = 'transformers'
|
||||
if llm.__llm_backend__ == 'ggml': serde = 'ggml'
|
||||
return getattr(importlib.import_module(f'.{serde}', __name__), fn)(llm, *args, **kwargs)
|
||||
|
||||
return caller
|
||||
|
||||
@@ -105,9 +88,6 @@ if t.TYPE_CHECKING:
|
||||
def import_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model:
|
||||
...
|
||||
|
||||
def save_pretrained(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> None:
|
||||
...
|
||||
|
||||
def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M:
|
||||
...
|
||||
|
||||
|
||||
@@ -5,10 +5,10 @@ This requires ctransformers to be installed.
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import bentoml
|
||||
import openllm
|
||||
|
||||
from openllm_core._typing_compat import M
|
||||
|
||||
_conversion_strategy = {'pt': 'ggml'}
|
||||
@@ -21,30 +21,7 @@ def import_model(llm: openllm.LLM[t.Any, t.Any],
|
||||
raise NotImplementedError('Currently work in progress.')
|
||||
|
||||
def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
|
||||
'''Return an instance of ``bentoml.Model`` from given LLM instance.
|
||||
|
||||
By default, it will try to check the model in the local store.
|
||||
If model is not found, and ``auto_import`` is set to True, it will try to import the model from HuggingFace Hub.
|
||||
|
||||
Otherwise, it will raises a ``bentoml.exceptions.NotFound``.
|
||||
'''
|
||||
try:
|
||||
model = bentoml.models.get(llm.tag)
|
||||
if model.info.module not in ('openllm.serialisation.ggml', __name__):
|
||||
raise bentoml.exceptions.NotFound(
|
||||
f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
|
||||
)
|
||||
if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
|
||||
return model
|
||||
except bentoml.exceptions.NotFound:
|
||||
if auto_import:
|
||||
return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
|
||||
raise
|
||||
raise NotImplementedError('Currently work in progress.')
|
||||
|
||||
def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
raise NotImplementedError('Currently work in progress.')
|
||||
|
||||
def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs: t.Any) -> None:
|
||||
raise NotImplementedError('Currently work in progress.')
|
||||
|
||||
@@ -5,6 +5,7 @@ import logging
|
||||
import typing as t
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from packaging.version import Version
|
||||
from simple_di import Provide
|
||||
from simple_di import inject
|
||||
|
||||
@@ -28,22 +29,18 @@ if t.TYPE_CHECKING:
|
||||
import auto_gptq as autogptq
|
||||
import torch
|
||||
import torch.nn
|
||||
import transformers
|
||||
import vllm
|
||||
|
||||
from bentoml._internal.models import ModelStore
|
||||
from openllm_core._typing_compat import DictStrAny
|
||||
from openllm_core._typing_compat import M
|
||||
from openllm_core._typing_compat import T
|
||||
else:
|
||||
vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm')
|
||||
autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq')
|
||||
transformers = openllm.utils.LazyLoader('transformers', globals(), 'transformers')
|
||||
torch = openllm.utils.LazyLoader('torch', globals(), 'torch')
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
__all__ = ['import_model', 'get', 'load_model', 'save_pretrained']
|
||||
__all__ = ['import_model', 'get', 'load_model']
|
||||
|
||||
@inject
|
||||
def import_model(llm: openllm.LLM[M, T],
|
||||
@@ -74,7 +71,7 @@ def import_model(llm: openllm.LLM[M, T],
|
||||
safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'),
|
||||
default=llm._serialisation_format == 'safetensors')
|
||||
# Disable safe serialization with vLLM
|
||||
if llm.__llm_implementation__ == 'vllm': safe_serialisation = False
|
||||
if llm.__llm_backend__ == 'vllm': safe_serialisation = False
|
||||
metadata: DictStrAny = {
|
||||
'safe_serialisation': safe_serialisation,
|
||||
'_quantize': quantize_method is not None and quantize_method
|
||||
@@ -95,8 +92,8 @@ def import_model(llm: openllm.LLM[M, T],
|
||||
# since saving int4 is not yet supported
|
||||
if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False):
|
||||
attrs.pop('quantization_config')
|
||||
if llm.__llm_implementation__ != 'flax': attrs['use_safetensors'] = safe_serialisation
|
||||
metadata['_framework'] = 'pt' if llm.__llm_implementation__ == 'vllm' else llm.__llm_implementation__
|
||||
if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation
|
||||
metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__
|
||||
|
||||
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id,
|
||||
trust_remote_code=trust_remote_code,
|
||||
@@ -108,7 +105,7 @@ def import_model(llm: openllm.LLM[M, T],
|
||||
imported_modules: list[types.ModuleType] = []
|
||||
bentomodel = bentoml.Model.create(llm.tag,
|
||||
module='openllm.serialisation.transformers',
|
||||
api_version='v1',
|
||||
api_version='v2',
|
||||
options=ModelOptions(),
|
||||
context=openllm.utils.generate_context(framework_name='openllm'),
|
||||
labels=openllm.utils.generate_labels(llm),
|
||||
@@ -133,8 +130,7 @@ def import_model(llm: openllm.LLM[M, T],
|
||||
trust_remote_code=trust_remote_code,
|
||||
use_safetensors=safe_serialisation,
|
||||
**hub_attrs,
|
||||
**attrs,
|
||||
)
|
||||
**attrs)
|
||||
update_model(bentomodel,
|
||||
metadata={
|
||||
'_pretrained_class': model.__class__.__name__,
|
||||
@@ -192,27 +188,21 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
|
||||
'''
|
||||
try:
|
||||
model = bentoml.models.get(llm.tag)
|
||||
if model.info.module not in ('openllm.serialisation.transformers'
|
||||
'bentoml.transformers', 'bentoml._internal.frameworks.transformers',
|
||||
__name__): # NOTE: backward compatible with previous version of OpenLLM.
|
||||
raise bentoml.exceptions.NotFound(
|
||||
f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
|
||||
)
|
||||
if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
|
||||
if Version(model.info.api_version) < Version('v2'):
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
|
||||
'Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
|
||||
if model.info.labels['backend'] != llm.__llm_backend__:
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}."
|
||||
)
|
||||
return model
|
||||
except bentoml.exceptions.NotFound as err:
|
||||
if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
|
||||
raise err from None
|
||||
except Exception as err:
|
||||
if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
|
||||
|
||||
def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
'''Load the model from BentoML store.
|
||||
|
||||
By default, it will try to find the model in the local store.
|
||||
If the model is not found, it raises a ``bentoml.exceptions.NotFound``.
|
||||
'''
|
||||
config, hub_attrs, attrs = process_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
|
||||
config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs)
|
||||
safe_serialization = openllm.utils.first_not_none(t.cast(
|
||||
t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
|
||||
attrs.pop('safe_serialization', None),
|
||||
@@ -229,7 +219,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
*decls,
|
||||
quantize_config=t.cast('autogptq.BaseQuantizeConfig',
|
||||
llm.quantization_config),
|
||||
trust_remote_code=llm.__llm_trust_remote_code__,
|
||||
trust_remote_code=llm.trust_remote_code,
|
||||
use_safetensors=safe_serialization,
|
||||
**hub_attrs,
|
||||
**attrs)
|
||||
@@ -238,57 +228,9 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
|
||||
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path,
|
||||
*decls,
|
||||
config=config,
|
||||
trust_remote_code=llm.__llm_trust_remote_code__,
|
||||
trust_remote_code=llm.trust_remote_code,
|
||||
device_map=device_map,
|
||||
**hub_attrs,
|
||||
**attrs).eval()
|
||||
# BetterTransformer is currently only supported on PyTorch.
|
||||
if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
|
||||
if llm.__llm_implementation__ in {'pt', 'vllm'}: check_unintialised_params(model)
|
||||
if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
|
||||
return t.cast('M', model)
|
||||
|
||||
def save_pretrained(llm: openllm.LLM[M, T],
|
||||
save_directory: str,
|
||||
is_main_process: bool = True,
|
||||
state_dict: DictStrAny | None = None,
|
||||
save_function: t.Any | None = None,
|
||||
push_to_hub: bool = False,
|
||||
max_shard_size: int | str = '10GB',
|
||||
safe_serialization: bool = False,
|
||||
variant: str | None = None,
|
||||
**attrs: t.Any) -> None:
|
||||
save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save))
|
||||
model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs)
|
||||
safe_serialization = safe_serialization or llm._serialisation_format == 'safetensors'
|
||||
# NOTE: disable safetensors for vllm
|
||||
if llm.__llm_implementation__ == 'vllm': safe_serialization = False
|
||||
if llm._quantize_method == 'gptq':
|
||||
if not openllm.utils.is_autogptq_available():
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
|
||||
)
|
||||
if llm.config['model_type'] != 'causal_lm':
|
||||
raise openllm.exceptions.OpenLLMException(
|
||||
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
|
||||
if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM):
|
||||
raise ValueError(f'Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})')
|
||||
t.cast('autogptq.modeling.BaseGPTQForCausalLM', llm.model).save_quantized(save_directory,
|
||||
use_safetensors=safe_serialization)
|
||||
elif openllm.utils.LazyType['vllm.LLMEngine']('vllm.LLMEngine').isinstance(llm.model):
|
||||
raise RuntimeError(
|
||||
"vllm.LLMEngine cannot be serialisation directly. This happens when 'save_pretrained' is called directly after `openllm.AutoVLLM` is initialized."
|
||||
)
|
||||
elif isinstance(llm.model, transformers.Pipeline):
|
||||
llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
|
||||
else:
|
||||
# We can safely cast here since it will be the PreTrainedModel protocol.
|
||||
t.cast('transformers.PreTrainedModel', llm.model).save_pretrained(save_directory,
|
||||
is_main_process=is_main_process,
|
||||
state_dict=state_dict,
|
||||
save_function=save_function,
|
||||
push_to_hub=push_to_hub,
|
||||
max_shard_size=max_shard_size,
|
||||
safe_serialization=safe_serialization,
|
||||
variant=variant,
|
||||
**model_save_attrs)
|
||||
llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
|
||||
|
||||
@@ -76,7 +76,7 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra
|
||||
if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING: idx = 0
|
||||
elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: idx = 1
|
||||
else: raise openllm.exceptions.OpenLLMException(f'Model type {type(config)} is not supported yet.')
|
||||
return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_implementation__][idx])
|
||||
return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_backend__][idx])
|
||||
|
||||
def check_unintialised_params(model: torch.nn.Module) -> None:
|
||||
unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')]
|
||||
@@ -104,11 +104,11 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod
|
||||
def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
|
||||
infer_fn: tuple[str, ...] = ('__call__',)
|
||||
default_config = ModelSignature(batchable=False)
|
||||
if llm.__llm_implementation__ in {'pt', 'vllm'}:
|
||||
if llm.__llm_backend__ in {'pt', 'vllm'}:
|
||||
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
|
||||
'group_beam_search', 'constrained_beam_search',
|
||||
)
|
||||
elif llm.__llm_implementation__ == 'tf':
|
||||
elif llm.__llm_backend__ == 'tf':
|
||||
infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search',
|
||||
'contrastive_search',
|
||||
)
|
||||
|
||||
@@ -23,9 +23,9 @@ class HfIgnore:
|
||||
|
||||
@classmethod
|
||||
def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
|
||||
if llm.__llm_implementation__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
|
||||
elif llm.__llm_implementation__ == 'tf': base = [cls.flax, cls.pt]
|
||||
elif llm.__llm_implementation__ == 'flax':
|
||||
if llm.__llm_backend__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
|
||||
elif llm.__llm_backend__ == 'tf': base = [cls.flax, cls.pt]
|
||||
elif llm.__llm_backend__ == 'flax':
|
||||
base = [cls.tf, cls.pt, cls.safetensors] # as of current, safetensors is not supported with flax
|
||||
else:
|
||||
base = [cls.tf, cls.flax]
|
||||
|
||||
@@ -10,7 +10,7 @@ import bentoml
|
||||
import openllm
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm_core._typing_compat import LiteralRuntime
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -18,10 +18,9 @@ logger = logging.getLogger(__name__)
|
||||
def build_bento(model: str,
|
||||
model_id: str | None = None,
|
||||
quantize: t.Literal['int4', 'int8', 'gptq'] | None = None,
|
||||
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
|
||||
cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
|
||||
logger.info('Building BentoML for %s', model)
|
||||
bento = openllm.build(model, model_id=model_id, quantize=quantize, runtime=runtime)
|
||||
bento = openllm.build(model, model_id=model_id, quantize=quantize)
|
||||
yield bento
|
||||
if cleanup:
|
||||
logger.info('Deleting %s', bento.tag)
|
||||
@@ -49,7 +48,7 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag,
|
||||
@contextlib.contextmanager
|
||||
def prepare(model: str,
|
||||
model_id: str | None = None,
|
||||
implementation: LiteralRuntime = 'pt',
|
||||
implementation: LiteralBackend = 'pt',
|
||||
deployment_mode: t.Literal['container', 'local'] = 'local',
|
||||
clean_context: contextlib.ExitStack | None = None,
|
||||
cleanup: bool = True) -> t.Iterator[str]:
|
||||
|
||||
@@ -16,11 +16,11 @@ from . import dummy_vllm_objects as dummy_vllm_objects
|
||||
if t.TYPE_CHECKING:
|
||||
import openllm
|
||||
|
||||
from openllm_core._typing_compat import LiteralRuntime
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
|
||||
def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
|
||||
return {
|
||||
'runtime': llm.runtime,
|
||||
'backend': llm.__llm_backend__,
|
||||
'framework': 'openllm',
|
||||
'model_name': llm.config['model_name'],
|
||||
'architecture': llm.config['architecture'],
|
||||
@@ -28,14 +28,13 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
|
||||
}
|
||||
|
||||
def infer_auto_class(
|
||||
implementation: LiteralRuntime
|
||||
) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
|
||||
backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
|
||||
import openllm
|
||||
if implementation == 'tf': return openllm.AutoTFLLM
|
||||
elif implementation == 'flax': return openllm.AutoFlaxLLM
|
||||
elif implementation == 'pt': return openllm.AutoLLM
|
||||
elif implementation == 'vllm': return openllm.AutoVLLM
|
||||
else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')")
|
||||
if backend == 'tf': return openllm.AutoTFLLM
|
||||
elif backend == 'flax': return openllm.AutoFlaxLLM
|
||||
elif backend == 'pt': return openllm.AutoLLM
|
||||
elif backend == 'vllm': return openllm.AutoVLLM
|
||||
else: raise RuntimeError(f"Unknown backend: {backend} (supported: 'pt', 'flax', 'tf', 'vllm')")
|
||||
|
||||
__all__ = [
|
||||
'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects',
|
||||
|
||||
@@ -30,12 +30,10 @@ def model_settings(draw: st.DrawFn):
|
||||
st.booleans(),
|
||||
'requirements':
|
||||
st.none() | st.lists(st.text(), min_size=1),
|
||||
'default_implementation':
|
||||
'default_backend':
|
||||
st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
|
||||
'model_type':
|
||||
st.sampled_from(['causal_lm', 'seq2seq_lm']),
|
||||
'runtime':
|
||||
st.sampled_from(['transformers', 'ggml']),
|
||||
'name_type':
|
||||
st.sampled_from(['dasherize', 'lowercase']),
|
||||
'timeout':
|
||||
|
||||
@@ -111,10 +111,7 @@ def patch_env(**attrs: t.Any):
|
||||
yield
|
||||
|
||||
def test_struct_envvar():
|
||||
with patch_env(**{
|
||||
field_env_key('env_llm', 'field1'): '4',
|
||||
field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',
|
||||
}):
|
||||
with patch_env(**{field_env_key('field1'): '4', field_env_key('temperature', suffix='generation'): '0.2',}):
|
||||
|
||||
class EnvLLM(openllm.LLMConfig):
|
||||
__config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
|
||||
@@ -146,8 +143,8 @@ def test_struct_provided_fields():
|
||||
|
||||
def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as mk:
|
||||
mk.setenv(field_env_key('overwrite_with_env_available', 'field1'), str(4.0))
|
||||
mk.setenv(field_env_key('overwrite_with_env_available', 'temperature', suffix='generation'), str(0.2))
|
||||
mk.setenv(field_env_key('field1'), str(4.0))
|
||||
mk.setenv(field_env_key('temperature', suffix='generation'), str(0.2))
|
||||
sent = make_llm_config('OverwriteWithEnvAvailable', {
|
||||
'default_id': 'asdfasdf',
|
||||
'model_ids': ['asdf', 'asdfasdfads'],
|
||||
|
||||
@@ -8,9 +8,9 @@ import pytest
|
||||
import openllm
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from openllm_core._typing_compat import LiteralRuntime
|
||||
from openllm_core._typing_compat import LiteralBackend
|
||||
|
||||
_FRAMEWORK_MAPPING = {
|
||||
_MODELING_MAPPING = {
|
||||
'flan_t5': 'google/flan-t5-small',
|
||||
'opt': 'facebook/opt-125m',
|
||||
'baichuan': 'baichuan-inc/Baichuan-7B',
|
||||
@@ -22,19 +22,17 @@ _PROMPT_MAPPING = {
|
||||
|
||||
def parametrise_local_llm(
|
||||
model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
|
||||
if model not in _FRAMEWORK_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
|
||||
runtime_impl: tuple[LiteralRuntime, ...] = tuple()
|
||||
if model in openllm.MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
|
||||
if model in openllm.MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
|
||||
if model in openllm.MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
|
||||
for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()):
|
||||
llm = openllm.Runner(model,
|
||||
model_id=_FRAMEWORK_MAPPING[model],
|
||||
ensure_available=True,
|
||||
implementation=framework,
|
||||
init_local=True,
|
||||
)
|
||||
yield prompt, llm
|
||||
if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
|
||||
backends: tuple[LiteralBackend, ...] = tuple()
|
||||
if model in openllm.MODEL_MAPPING_NAMES: backends += ('pt',)
|
||||
if model in openllm.MODEL_FLAX_MAPPING_NAMES: backends += ('flax',)
|
||||
if model in openllm.MODEL_TF_MAPPING_NAMES: backends += ('tf',)
|
||||
for backend, prompt in itertools.product(backends, _PROMPT_MAPPING.keys()):
|
||||
yield prompt, openllm.Runner(model,
|
||||
model_id=_MODELING_MAPPING[model],
|
||||
ensure_available=True,
|
||||
backend=backend,
|
||||
init_local=True)
|
||||
|
||||
def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
|
||||
if os.getenv('GITHUB_ACTIONS') is None:
|
||||
|
||||
@@ -4,6 +4,7 @@ import os
|
||||
import typing as t
|
||||
|
||||
import pytest
|
||||
import transformers
|
||||
|
||||
import openllm
|
||||
|
||||
@@ -28,7 +29,7 @@ def test_general_build_with_internal_testing():
|
||||
bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
|
||||
|
||||
assert llm.llm_type == bento.info.labels['_type']
|
||||
assert llm.config['env']['framework_value'] == bento.info.labels['_framework']
|
||||
assert llm.config['env']['backend_value'] == bento.info.labels['_framework']
|
||||
|
||||
bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
|
||||
assert len(bento_store.list(bento.tag)) == 1
|
||||
@@ -38,10 +39,11 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
|
||||
local_path = tmp_path_factory.mktemp('local_t5')
|
||||
llm = openllm.AutoLLM.for_model('flan-t5', model_id=HF_INTERNAL_T5_TESTING, ensure_available=True)
|
||||
|
||||
if llm.bettertransformer:
|
||||
llm.__llm_model__ = llm.model.reverse_bettertransformer()
|
||||
|
||||
llm.save_pretrained(local_path)
|
||||
if isinstance(llm.model, transformers.Pipeline):
|
||||
llm.model.save_pretrained(str(local_path))
|
||||
else:
|
||||
llm.model.save_pretrained(str(local_path))
|
||||
llm.tokenizer.save_pretrained(str(local_path))
|
||||
|
||||
assert openllm.build('flan-t5', model_id=local_path.resolve().__fspath__(), model_version='local')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user