refactor(breaking): unify LLM API (#283)

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-09-01 05:15:19 -04:00
committed by GitHub
parent 35601dab20
commit 3e45530abd
50 changed files with 881 additions and 1232 deletions

View File

@@ -47,7 +47,7 @@ _import_structure: dict[str, list[str]] = {
"cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
"_quantisation": ["infer_quantisation_config"],
"_embeddings": ["GenericEmbeddingRunnable"],
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"],
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "EmbeddingsOutput"],
"_generation": [
"StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList",
"prepare_logits_processor"
@@ -72,7 +72,7 @@ COMPILED = _Path(__file__).suffix in (".pyd", ".so")
if _t.TYPE_CHECKING:
from . import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing
from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor
from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
from ._llm import LLM as LLM, EmbeddingsOutput as EmbeddingsOutput, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
from ._quantisation import infer_quantisation_config as infer_quantisation_config
from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
@@ -196,7 +196,12 @@ else:
__lazy = openllm_core.utils.LazyModule(__name__,
globals()["__file__"],
_import_structure,
extra_objects={"COMPILED": COMPILED})
extra_objects={
"COMPILED": COMPILED,
"__openllm_migration__": {
"LLMEmbeddings": "EmbeddingsOutput"
}
})
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__

View File

@@ -0,0 +1,201 @@
'''LLM assignment magik.'''
from __future__ import annotations
import functools
import traceback
import typing as t
import openllm
from openllm.exceptions import OpenLLMException
from openllm_core._configuration import _object_getattribute
from openllm_core._configuration import _setattr_class
from openllm_core._schema import unmarshal_vllm_outputs
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import ListStr
from openllm_core._typing_compat import M
from openllm_core._typing_compat import T
from openllm_core._typing_compat import import_model_protocol
from openllm_core._typing_compat import llm_post_init_protocol
from openllm_core._typing_compat import load_model_protocol
from openllm_core._typing_compat import load_tokenizer_protocol
from openllm_core.utils import LazyLoader
from openllm_core.utils import codegen
from openllm_core.utils import device_count
from openllm_core.utils import first_not_none
from openllm_core.utils import is_torch_available
if t.TYPE_CHECKING:
import torch
import vllm
import bentoml
from openllm._llm import LLM
else:
torch = LazyLoader('torch', globals(), 'torch')
vllm = LazyLoader('vllm', globals(), 'vllm')
def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
    '''Decorate an ``import_model`` implementation with the LLM's stored model arguments.

    The wrapper resolves ``trust_remote_code`` through ``first_not_none`` (using
    ``self.trust_remote_code`` as the default), places the positional model arguments taken from
    ``self.llm_parameters`` in front of the caller's ``decls`` and merges the stored model keyword
    arguments with ``attrs``; on a key clash the caller's ``attrs`` value wins.
    '''
    @functools.wraps(fn)
    def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
        resolved_trust = first_not_none(trust_remote_code, default=self.trust_remote_code)
        # llm_parameters unpacks as ((model positional args, model keyword args), <second element, unused here>)
        (stored_decls, stored_attrs), _ = self.llm_parameters
        merged_attrs = {**stored_attrs, **attrs}
        return fn(self, *stored_decls, *decls, trust_remote_code=resolved_trust, **merged_attrs)

    return inner
def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
    '''Decorate a ``load_model`` implementation with backend-aware loading.

    For any backend other than ``'vllm'`` the wrapped ``fn`` is called with the model arguments stored
    in ``self.llm_parameters`` merged with the caller's arguments (caller keyword arguments win).
    For the ``'vllm'`` backend ``fn`` is not called at all: a ``vllm.LLMEngine`` is built from the
    model located at ``self._bentomodel.path``.

    Raises:
        OpenLLMException: if constructing the vLLM engine fails. The original traceback is printed
            and the exception chain is suppressed.
    '''
    @functools.wraps(fn)
    def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
        if self.__llm_backend__ != 'vllm':
            (stored_decls, stored_attrs), _ = self.llm_parameters
            return fn(self, *stored_decls, *decls, **{**stored_attrs, **attrs})

        # TODO: Do some more processing with token_id once we support token streaming
        try:
            engine_args = vllm.EngineArgs(
                model=self._bentomodel.path,
                # 'local' means the tokenizer lives next to the model weights
                tokenizer=self._bentomodel.path if self.tokenizer_id == 'local' else self.tokenizer_id,
                tokenizer_mode='auto',
                tensor_parallel_size=1 if device_count() < 2 else device_count(),
                dtype='auto',
                worker_use_ray=False,
            )
            return vllm.LLMEngine.from_engine_args(engine_args)
        except Exception as err:
            traceback.print_exc()
            raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from None

    return inner
def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]], T]:
    '''Decorate a ``load_tokenizer`` implementation with the LLM's stored tokenizer arguments.

    The last element of ``self.llm_parameters`` supplies the baseline keyword arguments; any keyword
    passed by the caller overrides the stored value of the same name.
    '''
    @functools.wraps(fn)
    def inner(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
        stored_tokenizer_attrs = self.llm_parameters[-1]
        combined = {**stored_tokenizer_attrs, **tokenizer_attrs}
        return fn(self, **combined)

    return inner
def llm_post_init(fn: llm_post_init_protocol[M, T]) -> t.Callable[[LLM[M, T]], None]:
    '''Decorate an ``llm_post_init`` hook so the torch device is assigned before it runs.

    When the backend is ``'pt'`` and torch is available, ``self.device`` is set to the CUDA device if
    CUDA is available and to the CPU device otherwise. The wrapped hook is then invoked; its return
    value is discarded.
    '''
    @functools.wraps(fn)
    def inner(self: LLM[M, T]) -> None:
        # the backend comparison comes first so torch availability is only probed for the 'pt' backend
        needs_torch_device = self.__llm_backend__ == 'pt' and is_torch_available()
        if needs_torch_device:
            device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
            self.device = torch.device(device_name)
        fn(self)

    return inner
def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
    '''Build the ``__assign_llm_attr`` function for the given LLM subclass.

    Source lines are assembled and handed to ``codegen.generate_function``. The lines:

    - wrap ``cls.llm_post_init`` with ``llm_post_init`` from this module;
    - wrap ``import_model``, ``load_model`` and ``load_tokenizer`` with the matching decorators,
      selecting ``openllm.serialisation.<name>`` when the attribute on ``cls`` is the one returned by
      ``_cached_LLMSerialisation_get``;
    - assign the vLLM ``generate``, ``generate_iterator`` and ``postprocess_generate`` functions
      when ``cls.__llm_backend__`` is ``'vllm'``;
    - set ``__llm_model__``, ``__llm_tokenizer__`` and ``__llm_adapter_map__`` to ``None``;
    - set one ``__llm_supports_<name>__`` attribute per matching ``LLMInterface`` annotation.

    Returns:
        Whatever ``codegen.generate_function`` produces for these lines (annotated as a callable
        taking the class).
    '''
    from ._llm import LLM
    from ._llm import LLMFunction
    from ._llm import LLMInterface
    from ._llm import LLMSerialisation

    cls_name = cls.__name__
    extra_args: ListStr = []
    namespace: DictStrAny = {'cls': cls, '__wrapped_llm_post_init': llm_post_init, 'LLM': LLM}
    # exposes _cached_LLMFunction_get and _cached_LLMSerialisation_get to the generated code
    for base in {LLMSerialisation, LLMFunction}:
        namespace[f'_cached_{base.__name__}_get'] = _object_getattribute.__get__(base)

    # llm_post_init implementation
    post_init_impl = f'_impl_{cls_name}_func'
    body: ListStr = [
        f'{post_init_impl}=cls.llm_post_init',
        _setattr_class('llm_post_init', f'__wrapped_llm_post_init({post_init_impl})'),
    ]

    # serialisation hooks; the same temporary name is reused for each hook in the generated code
    cached_name = f'_cached_{cls_name}_func'
    hook_decorators = (('import_model', import_model), ('load_model', load_model), ('load_tokenizer', load_tokenizer))
    for hook, decorator in hook_decorators:
        wrapper_name = f'__wrapped_{hook}'
        fallback_name = f'__serialisation_{hook}'
        impl_name = f'_impl_{cls_name}_{hook}'
        namespace[fallback_name] = getattr(openllm.serialisation, hook, None)
        namespace[wrapper_name] = decorator
        body.append(f'{cached_name}=cls.{hook}')
        body.append(
            f"{impl_name}={cached_name} if {cached_name} is not _cached_LLMSerialisation_get('{hook}') else {fallback_name}"
        )
        body.append(_setattr_class(hook, f'{wrapper_name}({impl_name})'))

    # assign vLLM implementation
    if cls.__llm_backend__ == 'vllm':
        vllm_impls = {
            '_vllm_generate': vllm_generate,
            '_vllm_generate_iterator': vllm_generate_iterator,
            '_vllm_postprocess_generate': vllm_postprocess_generate,
        }
        namespace.update(vllm_impls)
        vllm_prefix_len = len('_vllm_')
        # the attribute name is the global name without its '_vllm_' prefix
        body.extend([_setattr_class(global_name[vllm_prefix_len:], global_name) for global_name in vllm_impls])

    interface_anns = codegen.get_annotations(LLMInterface)

    # cached attribute initialisation
    for cached_key in {'model', 'tokenizer', 'adapter_map'}:
        body.append(_setattr_class(f'__llm_{cached_key}__', None))

    # boolean for better LLM implementation resolver
    support_prefix = '__llm_supports_'
    # strip the prefix and the trailing '__' to recover the bare feature name
    supported = {ann[len(support_prefix):-2] for ann in interface_anns if ann.startswith(support_prefix)}
    body.extend([
        _setattr_class(f'{support_prefix}{feature}__', f"cls.{feature} is not _cached_LLMFunction_get('{feature}')")
        for feature in supported
    ])

    return codegen.generate_function(cls,
                                     '__assign_llm_attr',
                                     body,
                                     args=('cls', *extra_args),
                                     globs=namespace,
                                     annotations={
                                         'cls': 't.Type[LLM]',
                                         'return': None
                                     })
def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]],
                              **_: t.Any) -> str:
    '''Return the text of the first output of the first generation result.

    ``prompt`` and any extra keyword arguments are accepted but not used.
    '''
    first_result = generation_result[0]
    first_output = first_result['outputs'][0]
    return first_output['text']
def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T],
                           prompt: str,
                           /,
                           *,
                           echo: bool = False,
                           stop: str | t.Iterable[str] | None = None,
                           stop_token_ids: list[int] | None = None,
                           **attrs: t.Any) -> t.Iterator[dict[str, t.Any]]:
    '''Stream generation results for ``prompt`` from the vLLM engine held in ``self.model``.

    Args:
        prompt: The prompt submitted to the engine.
        echo: When true, each yielded text is prefixed with the prompt reported by the engine.
        stop: A stop string, or any iterable of stop strings. An empty string is ignored.
        stop_token_ids: Token ids whose decoded text is added to the stop strings. The list is not
            modified; the tokenizer's ``eos_token_id`` is always considered in addition.
        **attrs: Must contain ``request_id``. ``temperature`` and ``top_p`` are honoured when given;
            everything left is forwarded to ``self.config.model_construct_env``.

    Yields:
        ``{'text': [...], 'error_code': 0}`` for each request output returned by ``self.model.step()``.

    Raises:
        ValueError: if ``request_id`` is missing or ``None`` (raised on first iteration, as this is a
            generator).
    '''
    request_id: str | None = attrs.pop('request_id', None)
    if request_id is None: raise ValueError('request_id must not be None.')

    # Copy before appending EOS so the caller's list is never mutated.
    token_ids = list(stop_token_ids) if stop_token_ids is not None else []
    token_ids.append(self.tokenizer.eos_token_id)

    stop_: set[str] = set()
    if isinstance(stop, str):
        if stop != '': stop_.add(stop)
    elif stop is not None:
        # accept any iterable of strings (list, tuple, set, ...), as the annotation promises
        stop_.update(stop)
    for tid in token_ids:
        # 0 is a valid token id; only a missing id (None) is skipped
        if tid is not None: stop_.add(self.tokenizer.decode(tid))

    # A per-request temperature takes precedence over the configured one for the greedy check.
    temperature = attrs.get('temperature')
    if temperature is None: temperature = self.config['temperature']
    # Pop top_p so it is not passed twice to model_construct_env below.
    requested_top_p = attrs.pop('top_p', None)
    if temperature <= 1e-5: top_p = 1.0
    elif requested_top_p is not None: top_p = requested_top_p
    else: top_p = self.config['top_p']

    config = self.config.model_construct_env(stop=list(stop_), top_p=top_p, **attrs)
    self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=config.to_sampling_config())
    # NOTE(review): outputs are not filtered by request_id; confirm the engine only serves this request here.
    while self.model.has_unfinished_requests():
        for request_output in self.model.step():
            if echo:
                echoed_prompt = request_output.prompt
                text_outputs = [echoed_prompt + output.text for output in request_output.outputs]
            else:
                text_outputs = [output.text for output in request_output.outputs]
            yield {'text': text_outputs, 'error_code': 0}
            if request_output.finished: break
def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
    '''Run ``prompt`` through the vLLM engine in ``self.model`` until no request is unfinished.

    ``attrs`` must contain ``request_id``; the remaining entries are forwarded to
    ``self.config.model_construct_env`` to build the sampling parameters. Every finished request
    output returned by the engine is converted with ``unmarshal_vllm_outputs``.

    Raises:
        ValueError: if ``request_id`` is missing or ``None``.
    '''
    request_id: str | None = attrs.pop('request_id', None)
    if request_id is None: raise ValueError('request_id must not be None.')

    # TODO: support prompt_token_ids
    self.model.add_request(request_id=request_id,
                           prompt=prompt,
                           sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())

    finished: list[vllm.RequestOutput] = []
    while self.model.has_unfinished_requests():
        for step_output in self.model.step():
            if step_output.finished:
                finished.append(step_output)
    return [unmarshal_vllm_outputs(item) for item in finished]

View File

@@ -58,7 +58,7 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
self.model.to(self.device)
@bentoml.Runnable.method(batchable=True, batch_dim=0)
def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]:
def encode(self, sentences: list[str]) -> t.Sequence[openllm.EmbeddingsOutput]:
import torch
import torch.nn.functional as F
encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(self.device)
@@ -69,8 +69,8 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
# Perform pooling and normalize
sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
return [
openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(),
num_tokens=int(torch.sum(attention_mask).item()))
openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(),
num_tokens=int(torch.sum(attention_mask).item()))
]
@staticmethod

View File

File diff suppressed because it is too large Load Diff

View File

@@ -78,7 +78,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
'model_id': runner.llm.model_id,
'timeout': 3600,
'model_name': llm_config['model_name'],
'framework': runner.llm_framework,
'backend': runner.backend,
'configuration': '',
'supports_embeddings': runner.supports_embeddings,
'supports_hf_agent': runner.supports_hf_agent
@@ -86,7 +86,7 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
def metadata_v1(_: str) -> openllm.MetadataOutput:
return openllm.MetadataOutput(timeout=llm_config['timeout'],
model_name=llm_config['model_name'],
framework=llm_config['env']['framework_value'],
backend=llm_config['env']['backend_value'],
model_id=runner.llm.model_id,
configuration=llm_config.model_dump_json().decode(),
supports_embeddings=runner.supports_embeddings,

View File

@@ -86,17 +86,17 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
env = llm.config['env']
framework_envvar = env['framework_value']
if framework_envvar == 'flax':
backend_envvar = env['backend_value']
if backend_envvar == 'flax':
if not openllm_core.utils.is_flax_available():
raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
raise ValueError(f"Flax is not available, while {env.backend} is set to 'flax'")
packages.extend(
[importlib.metadata.version('flax'),
importlib.metadata.version('jax'),
importlib.metadata.version('jaxlib')])
elif framework_envvar == 'tf':
elif backend_envvar == 'tf':
if not openllm_core.utils.is_tf_available():
raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'")
candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu',
'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
)
@@ -125,21 +125,22 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
return PythonOptions(packages=packages,
wheels=wheels,
lock_packages=False,
extra_index_url=['https://download.pytorch.org/whl/cu118'])
extra_index_url=[
'https://download.pytorch.org/whl/cu118',
'https://huggingface.github.io/autogptq-index/whl/cu118/'
])
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float,
quantize: LiteralString | None, bettertransformer: bool | None,
adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
runtime: t.Literal['ggml', 'transformers'], serialisation_format: t.Literal['safetensors',
'legacy'],
quantize: LiteralString | None, adapter_map: dict[str, str | None] | None,
dockerfile_template: str | None, serialisation_format: t.Literal['safetensors', 'legacy'],
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
from openllm.cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
env: openllm_core.utils.EnvVarMixin = llm.config['env']
if env['framework_value'] == 'vllm': serialisation_format = 'legacy'
if env['backend_value'] == 'vllm': serialisation_format = 'legacy'
env_dict = {
env.framework: env['framework_value'],
env.backend: env['backend_value'],
env.config: f"'{llm.config.model_dump_json().decode()}'",
env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}',
'OPENLLM_MODEL': llm.config['model_name'],
@@ -152,14 +153,9 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1')
# We need to handle None separately here, as env from subprocess doesn't accept None value.
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'],
bettertransformer=bettertransformer,
quantize=quantize,
runtime=runtime)
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize)
env_dict[_env.bettertransformer] = str(_env['bettertransformer_value'])
if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
env_dict[_env.runtime] = _env['runtime_value']
return DockerOptions(
base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
env=env_dict,
@@ -218,21 +214,19 @@ def create_bento(bento_tag: bentoml.Tag,
llm: openllm.LLM[t.Any, t.Any],
workers_per_resource: str | float,
quantize: LiteralString | None,
bettertransformer: bool | None,
dockerfile_template: str | None,
adapter_map: dict[str, str | None] | None = None,
extra_dependencies: tuple[str, ...] | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
container_registry: LiteralContainerRegistry = 'ecr',
container_version_strategy: LiteralContainerVersionStrategy = 'release',
_bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
_model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
framework_envvar = llm.config['env']['framework_value']
backend_envvar = llm.config['env']['backend_value']
labels = dict(llm.identifying_params)
labels.update({
'_type': llm.llm_type,
'_framework': framework_envvar,
'_framework': backend_envvar,
'start_name': llm.config['start_name'],
'base_name_or_path': llm.model_id,
'bundler': 'openllm.bundle'
@@ -265,8 +259,8 @@ def create_bento(bento_tag: bentoml.Tag,
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
models=[llm_spec],
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize,
bettertransformer, adapter_map, dockerfile_template,
runtime, serialisation_format, container_registry,
adapter_map, dockerfile_template,
serialisation_format, container_registry,
container_version_strategy))
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))

View File

@@ -94,7 +94,7 @@ class RefResolver:
git_hash: str = attr.field()
version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
strategy: LiteralContainerVersionStrategy = attr.field()
_ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO)
_ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
@classmethod
def _nightly_ref(cls) -> RefTuple:

View File

@@ -16,12 +16,15 @@ from click.shell_completion import CompletionItem
import bentoml
import openllm
import openllm_core
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm_core._typing_compat import Concatenate
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import LiteralBackend
from openllm_core._typing_compat import LiteralString
from openllm_core._typing_compat import ParamSpec
from openllm_core._typing_compat import get_literal_args
from openllm_core.utils import DEBUG
from . import termui
@@ -147,14 +150,12 @@ Available official model_id(s): [default: {llm_config['default_id']}]
@click.pass_context
def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
quantize: t.Literal['int8', 'int4', 'gptq'] | None, bettertransformer: bool | None,
runtime: t.Literal['ggml', 'transformers'], fast: bool, serialisation_format: t.Literal['safetensors',
'legacy'],
cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend,
serialisation_format: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None,
return_process: bool, **attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
if serialisation_format == 'safetensors' and quantize is not None and os.environ.get(
'OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env(
'OPENLLM_SERIALIZATION_WARNING'):
termui.echo(
f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
fg='yellow')
@@ -184,20 +185,18 @@ Available official model_id(s): [default: {llm_config['default_id']}]
# Create a new model env to work with the envvar during CLI invocation
env = openllm.utils.EnvVarMixin(config['model_name'],
config.default_implementation(),
backend,
model_id=model_id or config['default_id'],
bettertransformer=bettertransformer,
quantize=quantize,
runtime=runtime)
prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))
quantize=quantize)
requirements = llm_config['requirements']
if requirements is not None and len(requirements) > 0:
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
if len(missing_requirements) > 0:
termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
# NOTE: This is to set current configuration
start_env = os.environ.copy()
start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
if fast:
termui.echo(
f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'",
fg='yellow')
start_env.update({
'OPENLLM_MODEL': model,
@@ -205,21 +204,18 @@ Available official model_id(s): [default: {llm_config['default_id']}]
'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
'OPENLLM_SERIALIZATION': serialisation_format,
env.runtime: env['runtime_value'],
env.framework: env['framework_value']
env.backend: env['backend_value']
})
if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value'])
# NOTE: quantize and bettertransformer value is already assigned within env
if bettertransformer is not None: start_env[env.bettertransformer] = str(env['bettertransformer_value'])
if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))
llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(model,
model_id=start_env[env.model_id],
model_version=model_version,
llm_config=config,
ensure_available=not fast,
adapter_map=adapter_map,
serialisation=serialisation_format)
llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model,
model_id=start_env[env.model_id],
model_version=model_version,
llm_config=config,
ensure_available=True,
adapter_map=adapter_map,
serialisation=serialisation_format)
start_env.update({env.config: llm.config.model_dump_json().decode()})
server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer(
@@ -268,21 +264,6 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
return noop
def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None,
adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
if adapter_map and not openllm.utils.is_peft_available():
ctx.fail(
"Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
if quantize and llm_config.default_implementation() == 'vllm':
ctx.fail(
f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization."
)
requirements = llm_config['requirements']
if requirements is not None and len(requirements) > 0:
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
if len(missing_requirements) > 0:
termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
@@ -291,22 +272,21 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
cog.optgroup.group(
'General LLM Options',
help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
model_id_option(factory=cog.optgroup, model_env=llm_config['env']), model_version_option(factory=cog.optgroup),
model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup),
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
fast_option(factory=cog.optgroup),
backend_option(factory=cog.optgroup),
cog.optgroup.group('LLM Optimization Options',
help='''Optimization related options.
OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/),
k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
The following are either in our roadmap or currently being worked on:
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
''',
),
), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
cog.optgroup.option('--device',
type=openllm.utils.dantic.CUDA,
multiple=True,
@@ -314,13 +294,6 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
callback=parse_device_callback,
help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
show_envvar=True),
cog.optgroup.option('--runtime',
type=click.Choice(['ggml', 'transformers']),
default='transformers',
help='The runtime to use for the given model. Default is transformers.'),
quantize_option(factory=cog.optgroup, model_env=llm_config['env']),
bettertransformer_option(factory=cog.optgroup, model_env=llm_config['env']),
serialisation_option(factory=cog.optgroup),
cog.optgroup.group('Fine-tuning related options',
help='''\
Note that the argument `--adapter-id` can accept the following format:
@@ -439,18 +412,6 @@ def output_option(f: _AnyCallable | None = None,
shell_complete=complete_output_var,
**attrs)(f)
def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--fast/--no-fast',
show_default=True,
default=False,
envvar='OPENLLM_USE_LOCAL_LATEST',
show_envvar=True,
help='''Whether to skip checking if models is already in store.
This is useful if you already downloaded or setup the model beforehand.
''',
**attrs)(f)
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--cors/--no-cors',
show_default=True,
@@ -463,15 +424,12 @@ def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
    '''Attach the hidden ``--machine`` boolean flag (default ``False``) to ``f``.'''
    decorator = cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)
    return decorator(f)
def model_id_option(f: _AnyCallable | None = None,
*,
model_env: openllm.utils.EnvVarMixin | None = None,
**attrs: t.Any) -> t.Callable[[FC], FC]:
def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--model-id',
type=click.STRING,
default=None,
envvar=model_env.model_id if model_env is not None else None,
show_envvar=model_env is not None,
envvar='OPENLLM_MODEL_ID',
show_envvar=True,
help='Optional model_id name or path for (fine-tune) weight.',
**attrs)(f)
@@ -483,24 +441,31 @@ def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
help='Optional model version to save for this model. It will be inferred automatically from model-id.',
**attrs)(f)
def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
    '''Attach the ``--backend`` option (default ``'pt'``, env var ``OPENLLM_BACKEND``) to ``f``.'''
    # NOTE: LiteralBackend needs to remove the last two item as ggml and mlc is wip
    # XXX: remove the check for __args__ once we have ggml and mlc supports
    selectable_backends = get_literal_args(LiteralBackend)[:-2]
    decorator = cli_option('--backend',
                           type=click.Choice(selectable_backends),
                           default='pt',
                           envvar='OPENLLM_BACKEND',
                           show_envvar=True,
                           help='The implementation for saving this LLM.',
                           **attrs)
    return decorator(f)
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
    '''Attach the ``model_name`` argument, restricted to the dasherized ``openllm.CONFIG_MAPPING`` names.'''
    dasherized_names = [inflection.dasherize(model) for model in openllm.CONFIG_MAPPING]
    decorator = cli_argument('model_name', type=click.Choice(dasherized_names), required=required, **attrs)
    return decorator(f)
def quantize_option(f: _AnyCallable | None = None,
*,
build: bool = False,
model_env: openllm.utils.EnvVarMixin | None = None,
**attrs: t.Any) -> t.Callable[[FC], FC]:
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--quantise',
'--quantize',
'quantize',
type=click.Choice(['int8', 'int4', 'gptq']),
default=None,
envvar=model_env.quantize if model_env is not None else None,
show_envvar=model_env is not None,
envvar='OPENLLM_QUANTIZE',
show_envvar=True,
help='''Dynamic quantization for running this LLM.
The following quantization strategies are supported:
@@ -542,21 +507,6 @@ def workers_per_resource_option(f: _AnyCallable | None = None,
> ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''),
**attrs)(f)
def bettertransformer_option(f: _AnyCallable | None = None,
*,
build: bool = False,
model_env: openllm.utils.EnvVarMixin | None = None,
**attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--bettertransformer',
is_flag=True,
default=None,
envvar=model_env.bettertransformer if model_env is not None else None,
show_envvar=model_env is not None,
help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.' if not build else
'Set default environment variable whether to serve this model with FasterTransformer in build time.',
**attrs)(f)
def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--serialisation',
'--serialization',
@@ -586,22 +536,18 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
**attrs)(f)
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--container-registry',
'container_registry',
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
default='ecr',
show_default=True,
show_envvar=True,
envvar='OPENLLM_CONTAINER_REGISTRY',
callback=container_registry_callback,
help='''The default container registry to get the base image for building BentoLLM.
Currently, it supports 'ecr', 'ghcr.io', 'docker.io'
\b
> [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
''',
**attrs)(f)
return cli_option(
'--container-registry',
'container_registry',
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
default='ecr',
show_default=True,
show_envvar=True,
envvar='OPENLLM_CONTAINER_REGISTRY',
callback=container_registry_callback,
help=
'The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
**attrs)(f)
_wpr_strategies = {'round_robin', 'conserved'}

View File

@@ -23,9 +23,9 @@ from ._factory import start_command_factory
if t.TYPE_CHECKING:
from bentoml._internal.bento import BentoStore
from openllm_core._configuration import LLMConfig
from openllm_core._typing_compat import LiteralBackend
from openllm_core._typing_compat import LiteralContainerRegistry
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralString
logger = logging.getLogger(__name__)
@@ -38,10 +38,8 @@ def _start(model_name: str,
workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
device: tuple[str, ...] | t.Literal['all'] | None = None,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
bettertransformer: bool | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
adapter_map: dict[LiteralString, str | None] | None = None,
framework: LiteralRuntime | None = None,
backend: LiteralBackend | None = None,
additional_args: list[str] | None = None,
cors: bool = False,
_serve_grpc: bool = False,
@@ -57,48 +55,42 @@ def _start(model_name: str,
``openllm.start`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as the CLI interaction.
> [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
Args:
model_name: The model name to start this LLM
model_id: Optional model id for this given LLM
timeout: The server timeout
workers_per_resource: Number of workers per resource assigned.
See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
for more information. By default, this is set to 1.
model_name: The model name to start this LLM
model_id: Optional model id for this given LLM
timeout: The server timeout
workers_per_resource: Number of workers per resource assigned.
See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
for more information. By default, this is set to 1.
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
> - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
> - ``conserved``: This will determine the number of available GPU resources, and only assign
> one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
> equivalent to ``--workers-per-resource 0.25``.
device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
argument to assign all available GPUs to this LLM.
quantize: Quantize the model weights. This is only applicable for PyTorch models.
Possible quantisation strategies:
- int8: Quantize the model with 8bit (bitsandbytes required)
- int4: Quantize the model with 4bit (bitsandbytes required)
- gptq: Quantize the model with GPTQ (auto-gptq required)
bettertransformer: Convert given model to FastTransformer with PyTorch.
runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
framework: The framework to use for this LLM. By default, this is set to ``pt``.
additional_args: Additional arguments to pass to ``openllm start``.
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
> - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
> - ``conserved``: This will determine the number of available GPU resources, and only assign
> one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
> equivalent to ``--workers-per-resource 0.25``.
device: Assign GPU devices (if available) to this LLM. By default, this is set to ``None``. It also accepts 'all'
argument to assign all available GPUs to this LLM.
quantize: Quantize the model weights. This is only applicable for PyTorch models.
Possible quantisation strategies:
- int8: Quantize the model with 8bit (bitsandbytes required)
- int4: Quantize the model with 4bit (bitsandbytes required)
- gptq: Quantize the model with GPTQ (auto-gptq required)
cors: Whether to enable CORS for this LLM. By default, this is set to ``False``.
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
backend: The backend to use for this LLM. By default, this is set to ``pt``.
additional_args: Additional arguments to pass to ``openllm start``.
"""
from .entrypoint import start_command
from .entrypoint import start_grpc_command
llm_config = openllm.AutoConfig.for_model(model_name)
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
openllm_core.utils.first_not_none(
framework, default=llm_config.default_implementation()),
backend=openllm_core.utils.first_not_none(
backend, default=llm_config.default_backend()),
model_id=model_id,
bettertransformer=bettertransformer,
quantize=quantize,
runtime=runtime)
os.environ[_ModelEnv.framework] = _ModelEnv['framework_value']
quantize=quantize)
os.environ[_ModelEnv.backend] = _ModelEnv['backend_value']
args: list[str] = ['--runtime', runtime]
args: list[str] = []
if model_id: args.extend(['--model-id', model_id])
if timeout: args.extend(['--server-timeout', str(timeout)])
if workers_per_resource:
@@ -107,10 +99,7 @@ def _start(model_name: str,
str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource
])
if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
if quantize and bettertransformer:
raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
if quantize: args.extend(['--quantize', str(quantize)])
elif bettertransformer: args.append('--bettertransformer')
if cors: args.append('--cors')
if adapter_map:
args.extend(
@@ -134,12 +123,10 @@ def _build(model_name: str,
model_version: str | None = None,
bento_version: str | None = None,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
bettertransformer: bool | None = None,
adapter_map: dict[str, str | None] | None = None,
build_ctx: str | None = None,
enable_features: tuple[str, ...] | None = None,
workers_per_resource: float | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
dockerfile_template: str | None = None,
overwrite: bool = False,
container_registry: LiteralContainerRegistry | None = None,
@@ -153,59 +140,50 @@ def _build(model_name: str,
The LLM will be built into a BentoService with the following structure:
if ``quantize`` is passed, it will instruct the model to be quantized dynamically during serving time.
if ``bettertransformer`` is passed, it will instruct the model to apply FasterTransformer during serving time.
``openllm.build`` will invoke ``click.Command`` under the hood, so it behaves exactly the same as ``openllm build`` CLI.
> [!NOTE] ``quantize`` and ``bettertransformer`` are mutually exclusive.
Args:
model_name: The model name to start this LLM
model_id: Optional model id for this given LLM
model_version: Optional model version for this given LLM
bento_version: Optional bento version for this given BentoLLM
quantize: Quantize the model weights. This is only applicable for PyTorch models.
Possible quantisation strategies:
- int8: Quantize the model with 8bit (bitsandbytes required)
- int4: Quantize the model with 4bit (bitsandbytes required)
- gptq: Quantize the model with GPTQ (auto-gptq required)
bettertransformer: Convert given model to FastTransformer with PyTorch.
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
enable_features: Additional OpenLLM features to be included with this BentoLLM.
workers_per_resource: Number of workers per resource assigned.
See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
for more information. By default, this is set to 1.
model_name: The model name to start this LLM
model_id: Optional model id for this given LLM
model_version: Optional model version for this given LLM
bento_version: Optional bento version for this given BentoLLM
quantize: Quantize the model weights. This is only applicable for PyTorch models.
Possible quantisation strategies:
- int8: Quantize the model with 8bit (bitsandbytes required)
- int4: Quantize the model with 4bit (bitsandbytes required)
- gptq: Quantize the model with GPTQ (auto-gptq required)
adapter_map: The adapter mapping of LoRA to use for this LLM. It accepts a dictionary of ``{adapter_id: adapter_name}``.
build_ctx: The build context to use for building BentoLLM. By default, it sets to current directory.
enable_features: Additional OpenLLM features to be included with this BentoLLM.
workers_per_resource: Number of workers per resource assigned.
See [resource scheduling](https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy)
for more information. By default, this is set to 1.
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
> - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
> - ``conserved``: This will determine the number of available GPU resources, and only assign
> one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
> equivalent to ``--workers-per-resource 0.25``.
runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
Note that 'containerize' and 'push' are mutually exclusive
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
additional_args: Additional arguments to pass to ``openllm build``.
bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
> [!NOTE] ``--workers-per-resource`` will also accept the following strategies:
> - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
> - ``conserved``: This will determine the number of available GPU resources, and only assign
> one worker for the LLMRunner. For example, if there are 4 GPUs available, then ``conserved`` is
> equivalent to ``--workers-per-resource 0.25``.
dockerfile_template: The dockerfile template to use for building BentoLLM. See https://docs.bentoml.com/en/latest/guides/containerization.html#dockerfile-template.
overwrite: Whether to overwrite the existing BentoLLM. By default, this is set to ``False``.
push: Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.
containerize: Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.
Note that 'containerize' and 'push' are mutually exclusive
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
additional_args: Additional arguments to pass to ``openllm build``.
bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
Returns:
``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
args: list[str] = [
sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation',
serialisation_format
sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format
]
if quantize and bettertransformer:
raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
if quantize: args.extend(['--quantize', quantize])
if bettertransformer: args.append('--bettertransformer')
if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
if push: args.extend(['--push'])
if containerize: args.extend(['--containerize'])
@@ -241,8 +219,7 @@ def _import_model(model_name: str,
*,
model_id: str | None = None,
model_version: str | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
implementation: LiteralRuntime = 'pt',
backend: LiteralBackend = 'pt',
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
@@ -259,28 +236,24 @@ def _import_model(model_name: str,
> ``openllm.start`` will automatically invoke ``openllm.download`` under the hood.
Args:
model_name: The model name to start this LLM
model_id: Optional model id for this given LLM
model_version: Optional model version for this given LLM
runtime: The runtime to use for this LLM. By default, this is set to ``transformers``. In the future, this will include supports for GGML.
implementation: The implementation to use for this LLM. By default, this is set to ``pt``.
quantize: Quantize the model weights. This is only applicable for PyTorch models.
Possible quantisation strategies:
- int8: Quantize the model with 8bit (bitsandbytes required)
- int4: Quantize the model with 4bit (bitsandbytes required)
- gptq: Quantize the model with GPTQ (auto-gptq required)
serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
Default behaviour is similar to ``safe_serialization=True``.
additional_args: Additional arguments to pass to ``openllm import``.
model_name: The model name to start this LLM
model_id: Optional model id for this given LLM
model_version: Optional model version for this given LLM
backend: The backend to use for this LLM. By default, this is set to ``pt``.
quantize: Quantize the model weights. This is only applicable for PyTorch models.
Possible quantisation strategies:
- int8: Quantize the model with 8bit (bitsandbytes required)
- int4: Quantize the model with 4bit (bitsandbytes required)
- gptq: Quantize the model with GPTQ (auto-gptq required)
serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
Default behaviour is similar to ``safe_serialization=True``.
additional_args: Additional arguments to pass to ``openllm import``.
Returns:
``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
from .entrypoint import import_command
args = [
model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation',
serialisation_format,
]
args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format]
if model_id is not None: args.append(model_id)
if model_version is not None: args.extend(['--model-version', str(model_version)])
if additional_args is not None: args.extend(additional_args)

View File

@@ -66,7 +66,7 @@ from openllm.models.auto import AutoLLM
from openllm.utils import infer_auto_class
from openllm_core._typing_compat import Concatenate
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralBackend
from openllm_core._typing_compat import LiteralString
from openllm_core._typing_compat import ParamSpec
from openllm_core._typing_compat import Self
@@ -80,7 +80,6 @@ from openllm_core.utils import analytics
from openllm_core.utils import bentoml_cattr
from openllm_core.utils import compose
from openllm_core.utils import configure_logging
from openllm_core.utils import dantic
from openllm_core.utils import first_not_none
from openllm_core.utils import get_debug_mode
from openllm_core.utils import get_quiet_mode
@@ -94,15 +93,13 @@ from . import termui
from ._factory import FC
from ._factory import LiteralOutput
from ._factory import _AnyCallable
from ._factory import bettertransformer_option
from ._factory import backend_option
from ._factory import container_registry_option
from ._factory import fast_option
from ._factory import machine_option
from ._factory import model_id_option
from ._factory import model_name_argument
from ._factory import model_version_option
from ._factory import output_option
from ._factory import parse_device_callback
from ._factory import quantize_option
from ._factory import serialisation_option
from ._factory import start_command_factory
@@ -205,21 +202,6 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
return t.cast(t.Callable[Concatenate[bool, P], t.Any], wrapper)
@staticmethod
def exception_handling(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[P, t.Any]:
command_name = attrs.get('name', func.__name__)
@functools.wraps(func)
def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any:
try:
return func(*args, **attrs)
except OpenLLMException as err:
raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg='red')) from err
except KeyboardInterrupt:
pass
return wrapper
def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
if cmd_name in t.cast('Extensions', extension_command).list_commands(ctx):
return t.cast('Extensions', extension_command).get_command(ctx, cmd_name)
@@ -253,11 +235,11 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
name = name.replace('_', '-')
kwargs.setdefault('help', inspect.getdoc(f))
kwargs.setdefault('name', name)
wrapped = self.exception_handling(self.usage_tracking(self.common_params(f), self, **kwargs), self, **kwargs)
wrapped = self.usage_tracking(self.common_params(f), self, **kwargs)
# move common parameters to end of the parameters list
_memo = getattr(wrapped, '__click_params__', None)
if _memo is None: raise RuntimeError('Click command not register correctly.')
if _memo is None: raise ValueError('Click command not register correctly.')
_object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS:] + _memo[:-self.NUMBER_OF_COMMON_PARAMS])
# NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup
cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped)
@@ -348,11 +330,10 @@ _start_mapping = {
@click.argument('model_id', type=click.STRING, default=None, metavar='Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=False)
@click.argument('converter', envvar='CONVERTER', type=click.STRING, default=None, required=False, metavar=None)
@model_version_option
@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
@output_option
@quantize_option
@machine_option
@click.option('--implementation', type=click.Choice(['pt', 'tf', 'flax', 'vllm']), default=None, help='The implementation for saving this LLM.')
@backend_option
@serialisation_option
def import_command(
model_name: str,
@@ -360,9 +341,8 @@ def import_command(
converter: str | None,
model_version: str | None,
output: LiteralOutput,
runtime: t.Literal['ggml', 'transformers'],
machine: bool,
implementation: LiteralRuntime | None,
backend: LiteralBackend,
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
serialisation_format: t.Literal['safetensors', 'legacy'],
) -> bentoml.Model:
@@ -415,45 +395,42 @@ def import_command(
```bash
$ CONVERTER=llama2-hf openllm import llama /path/to/llama-2
```
> [!WARNING] This behaviour will override ``--runtime``. Therefore make sure that the LLM contains correct conversion strategies to both GGML and HF.
"""
llm_config = AutoConfig.for_model(model_name)
env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize)
impl: LiteralRuntime = first_not_none(implementation, default=env['framework_value'])
llm = infer_auto_class(impl).for_model(
env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize)
backend = first_not_none(backend, default=env['backend_value'])
llm = infer_auto_class(backend).for_model(
model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
)
_previously_saved = False
try:
_ref = serialisation.get(llm)
_previously_saved = True
except bentoml.exceptions.NotFound:
except openllm.exceptions.OpenLLMException:
if not machine and output == 'pretty':
msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for implementation {llm.__llm_implementation__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
termui.echo(msg, fg='yellow', nl=True)
_ref = serialisation.get(llm, auto_import=True)
if impl == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
if machine: return _ref
elif output == 'pretty':
if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for framework '{impl}': {_ref.tag!s}", nl=True, fg='yellow')
if _previously_saved: termui.echo(f"{model_name} with 'model_id={model_id}' is already setup for backend '{backend}': {_ref.tag!s}", nl=True, fg='yellow')
else: termui.echo(f'Saved model: {_ref.tag}')
elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'framework': impl, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
elif output == 'json': termui.echo(orjson.dumps({'previously_setup': _previously_saved, 'backend': backend, 'tag': str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode())
else: termui.echo(_ref.tag)
return _ref
@cli.command(context_settings={'token_normalize_func': inflection.underscore})
@model_name_argument
@model_id_option
@output_option
@machine_option
@backend_option
@click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the the model revision.')
@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
@workers_per_resource_option(factory=click, build=True)
@click.option('--device', type=dantic.CUDA, multiple=True, envvar='CUDA_VISIBLE_DEVICES', callback=parse_device_callback, help='Set the device', show_envvar=True)
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Optimisation options')
@quantize_option(factory=cog.optgroup, build=True)
@bettertransformer_option(factory=cog.optgroup)
@click.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.')
@click.option(
'--enable-features',
multiple=True,
@@ -476,7 +453,6 @@ def import_command(
@click.option(
'--container-version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='release', help="Default container version strategy for the image from '--container-registry'"
)
@fast_option
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options')
@cog.optgroup.option(
'--containerize',
@@ -496,21 +472,18 @@ def build_command(
bento_version: str | None,
overwrite: bool,
output: LiteralOutput,
runtime: t.Literal['ggml', 'transformers'],
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
enable_features: tuple[str, ...] | None,
bettertransformer: bool | None,
workers_per_resource: float | None,
adapter_id: tuple[str, ...],
build_ctx: str | None,
backend: LiteralBackend,
machine: bool,
device: tuple[str, ...],
model_version: str | None,
dockerfile_template: t.TextIO | None,
containerize: bool,
push: bool,
serialisation_format: t.Literal['safetensors', 'legacy'],
fast: bool,
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
force_push: bool,
@@ -539,22 +512,21 @@ def build_command(
_previously_built = False
llm_config = AutoConfig.for_model(model_name)
env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, quantize=quantize, bettertransformer=bettertransformer, runtime=runtime)
env = EnvVarMixin(model_name, backend=backend, model_id=model_id, quantize=quantize)
# NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
# during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
try:
os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), env.runtime: str(env['runtime_value']), 'OPENLLM_SERIALIZATION': serialisation_format})
os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']})
if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value'])
if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value'])
os.environ[env.bettertransformer] = str(env['bettertransformer_value'])
llm = infer_auto_class(env['framework_value']).for_model(
model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs
llm = infer_auto_class(env['backend_value']).for_model(
model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs
)
labels = dict(llm.identifying_params)
labels.update({'_type': llm.llm_type, '_framework': env['framework_value']})
labels.update({'_type': llm.llm_type, '_framework': env['backend_value']})
workers_per_resource = first_not_none(workers_per_resource, default=llm_config['workers_per_resource'])
with fs.open_fs(f"temp://llm_{llm_config['model_name']}") as llm_fs:
@@ -603,10 +575,8 @@ def build_command(
workers_per_resource=workers_per_resource,
adapter_map=adapter_map,
quantize=quantize,
bettertransformer=bettertransformer,
extra_dependencies=enable_features,
dockerfile_template=dockerfile_template_path,
runtime=runtime,
container_registry=container_registry,
container_version_strategy=container_version_strategy
)
@@ -632,16 +602,17 @@ def build_command(
if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
elif containerize:
backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
container_backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
try:
bentoml.container.health(backend)
bentoml.container.health(container_backend)
except subprocess.CalledProcessError:
raise OpenLLMException(f'Failed to use backend {backend}') from None
try:
bentoml.container.build(bento.tag, backend=backend, features=('grpc', 'io'))
bentoml.container.build(bento.tag, backend=container_backend, features=('grpc', 'io'))
except Exception as err:
raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err
return bento
@cli.command()
@output_option
@click.option('--show-available', is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').")
@@ -667,21 +638,21 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
else:
failed_initialized: list[tuple[str, Exception]] = []
json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'runtime_impl'], t.Any] | t.Any] = {}
json_data: dict[str, dict[t.Literal['architecture', 'model_id', 'url', 'installation', 'cpu', 'gpu', 'backend'], t.Any] | t.Any] = {}
converted: list[str] = []
for m in models:
config = AutoConfig.for_model(m)
runtime_impl: tuple[str, ...] = ()
if config['model_name'] in MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
if config['model_name'] in MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ('vllm',)
backend: tuple[str, ...] = ()
if config['model_name'] in MODEL_MAPPING_NAMES: backend += ('pt',)
if config['model_name'] in MODEL_FLAX_MAPPING_NAMES: backend += ('flax',)
if config['model_name'] in MODEL_TF_MAPPING_NAMES: backend += ('tf',)
if config['model_name'] in MODEL_VLLM_MAPPING_NAMES: backend += ('vllm',)
json_data[m] = {
'architecture': config['architecture'],
'model_id': config['model_ids'],
'cpu': not config['requires_gpu'],
'gpu': True,
'runtime_impl': runtime_impl,
'backend': backend,
'installation': f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config['requirements'] else 'openllm',
}
converted.extend([normalise_model_name(i) for i in config['model_ids']])
@@ -708,10 +679,10 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo
import tabulate
tabulate.PRESERVE_WHITESPACE = True
# llm, architecture, url, model_id, installation, cpu, gpu, runtime_impl
data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralRuntime, ...]]] = []
# llm, architecture, url, model_id, installation, cpu, gpu, backend
data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralBackend, ...]]] = []
for m, v in json_data.items():
data.extend([(m, v['architecture'], v['model_id'], v['installation'], '' if not v['cpu'] else '', '', v['runtime_impl'],)])
data.extend([(m, v['architecture'], v['model_id'], v['installation'], '' if not v['cpu'] else '', '', v['backend'],)])
column_widths = [
int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),
]

View File

@@ -18,7 +18,7 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
prompt,
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
import torch
import torch.nn.functional as F
embeddings: list[list[float]] = []
@@ -30,4 +30,4 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
data = F.normalize(torch.mean(outputs.hidden_states[-1].transpose(0, 1), dim=0), p=2, dim=0)
embeddings.append(data.tolist())
num_tokens += len(input_ids[0])
return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens)

View File

@@ -17,7 +17,7 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
import torch
import torch.nn.functional as F
embeddings: list[list[float]] = []
@@ -29,4 +29,4 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
data = F.normalize(torch.mean(outputs.encoder_last_hidden_state[0], dim=0), p=2, dim=0)
embeddings.append(data.tolist())
num_tokens += len(input_ids[0])
return openllm.LLMEmbeddings(embeddings=embeddings, num_tokens=num_tokens)
return openllm.EmbeddingsOutput(embeddings=embeddings, num_tokens=num_tokens)

View File

@@ -13,7 +13,7 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
import torch
return {'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
import torch
import torch.nn.functional as F
encoding = self.tokenizer(prompts, padding=True, return_tensors='pt').to(self.device)
@@ -23,8 +23,8 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
masked_embeddings = data * mask
sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
num_tokens=int(torch.sum(attention_mask).item()))
return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
num_tokens=int(torch.sum(attention_mask).item()))
def generate_one(self, prompt: str, stop: list[str],
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:

View File

@@ -33,10 +33,6 @@ def get_mpt_config(model_id_or_path: str,
class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXTokenizerFast']):
__openllm_internal__ = True
def llm_post_init(self) -> None:
import torch
self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
@@ -49,7 +45,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
import torch
import transformers
_, tokenizer_attrs = self.llm_parameters
torch_dtype = attrs.pop('torch_dtype', self.dtype)
torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
device_map = attrs.pop('device_map', None)
attrs.pop('low_cpu_mem_usage', None)
config = get_mpt_config(self.model_id,
@@ -75,7 +71,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
import transformers
torch_dtype = attrs.pop('torch_dtype', self.dtype)
torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
device_map = attrs.pop('device_map', None)
trust_remote_code = attrs.pop('trust_remote_code', True)
config = get_mpt_config(self._bentomodel.path,

View File

@@ -8,10 +8,6 @@ if t.TYPE_CHECKING:
class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
__openllm_internal__ = True
def llm_post_init(self) -> None:
import torch
self.bettertransformer = True if not torch.cuda.is_available() else False
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch

View File

@@ -1,27 +1,9 @@
"""Serialisation utilities for OpenLLM.
'''Serialisation utilities for OpenLLM.
Currently supports transformers for PyTorch, Tensorflow and Flax.
GGML format support is currently a work in progress.
## Usage
```python
import openllm
llm = openllm.AutoLLM.for_model("dolly-v2")
llm.save_pretrained("./path/to/local-dolly")
```
To use different runtime, specify directly in the `for_model` method:
```python
import openllm
llm = openllm.AutoLLM.for_model("dolly-v2", runtime='ggml')
llm.save_pretrained("./path/to/local-dolly")
```
"""
'''
from __future__ import annotations
import importlib
import typing as t
@@ -54,7 +36,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
from .transformers._helpers import infer_tokenizers_from_llm
from .transformers._helpers import process_config
config, *_ = process_config(llm._bentomodel.path, llm.__llm_trust_remote_code__)
config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code)
bentomodel_fs = fs.open_fs(llm._bentomodel.path)
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile:
@@ -62,12 +44,11 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
tokenizer = cloudpickle.load(t.cast('t.IO[bytes]', cofile))['tokenizer']
except KeyError:
raise openllm.exceptions.OpenLLMException(
"Bento model does not have tokenizer. Make sure to save"
" the tokenizer within the model via 'custom_objects'."
" For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
"Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
"For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
else:
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'),
trust_remote_code=llm.__llm_trust_remote_code__,
trust_remote_code=llm.trust_remote_code,
**tokenizer_attrs)
if tokenizer.pad_token_id is None:
@@ -82,18 +63,20 @@ class _Caller(t.Protocol[P]):
def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
...
_extras = ['get', 'import_model', 'save_pretrained', 'load_model']
_extras = ['get', 'import_model', 'load_model']
def _make_dispatch_function(fn: str) -> _Caller[P]:
def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
"""Generic function dispatch to correct serialisation submodules based on LLM runtime.
> [!NOTE] See 'openllm.serialisation.transformers' if 'llm.runtime="transformers"'
> [!NOTE] See 'openllm.serialisation.transformers' if 'llm.__llm_backend__ in ("pt", "tf", "flax", "vllm")'
> [!NOTE] See 'openllm.serialisation.ggml' if 'llm.runtime="ggml"'
> [!NOTE] See 'openllm.serialisation.ggml' if 'llm.__llm_backend__="ggml"'
"""
return getattr(importlib.import_module(f'.{llm.runtime}', __name__), fn)(llm, *args, **kwargs)
serde = 'transformers'
if llm.__llm_backend__ == 'ggml': serde = 'ggml'
return getattr(importlib.import_module(f'.{serde}', __name__), fn)(llm, *args, **kwargs)
return caller
@@ -105,9 +88,6 @@ if t.TYPE_CHECKING:
def import_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model:
...
def save_pretrained(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> None:
...
def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M:
...

View File

@@ -5,10 +5,10 @@ This requires ctransformers to be installed.
from __future__ import annotations
import typing as t
import bentoml
import openllm
if t.TYPE_CHECKING:
import bentoml
import openllm
from openllm_core._typing_compat import M
_conversion_strategy = {'pt': 'ggml'}
@@ -21,30 +21,7 @@ def import_model(llm: openllm.LLM[t.Any, t.Any],
raise NotImplementedError('Currently work in progress.')
def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
'''Return an instance of ``bentoml.Model`` from given LLM instance.
By default, it will first look for the model in the local store.
If the model is not found and ``auto_import`` is set to True, it will try to import the model from HuggingFace Hub.
Otherwise, it raises ``bentoml.exceptions.NotFound``.
'''
try:
model = bentoml.models.get(llm.tag)
if model.info.module not in ('openllm.serialisation.ggml', __name__):
raise bentoml.exceptions.NotFound(
f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
)
if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
raise openllm.exceptions.OpenLLMException(
f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
return model
except bentoml.exceptions.NotFound:
if auto_import:
return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
raise
raise NotImplementedError('Currently work in progress.')
# Placeholder for loading a model through the GGML serialisation module.
# Not implemented yet: every call raises NotImplementedError, regardless of arguments.
def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
raise NotImplementedError('Currently work in progress.')
# Placeholder for saving a model through the GGML serialisation module.
# Not implemented yet: every call raises NotImplementedError, regardless of arguments.
def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs: t.Any) -> None:
raise NotImplementedError('Currently work in progress.')

View File

@@ -5,6 +5,7 @@ import logging
import typing as t
from huggingface_hub import snapshot_download
from packaging.version import Version
from simple_di import Provide
from simple_di import inject
@@ -28,22 +29,18 @@ if t.TYPE_CHECKING:
import auto_gptq as autogptq
import torch
import torch.nn
import transformers
import vllm
from bentoml._internal.models import ModelStore
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import M
from openllm_core._typing_compat import T
else:
vllm = openllm.utils.LazyLoader('vllm', globals(), 'vllm')
autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq')
transformers = openllm.utils.LazyLoader('transformers', globals(), 'transformers')
torch = openllm.utils.LazyLoader('torch', globals(), 'torch')
logger = logging.getLogger(__name__)
__all__ = ['import_model', 'get', 'load_model', 'save_pretrained']
__all__ = ['import_model', 'get', 'load_model']
@inject
def import_model(llm: openllm.LLM[M, T],
@@ -74,7 +71,7 @@ def import_model(llm: openllm.LLM[M, T],
safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'),
default=llm._serialisation_format == 'safetensors')
# Disable safe serialization with vLLM
if llm.__llm_implementation__ == 'vllm': safe_serialisation = False
if llm.__llm_backend__ == 'vllm': safe_serialisation = False
metadata: DictStrAny = {
'safe_serialisation': safe_serialisation,
'_quantize': quantize_method is not None and quantize_method
@@ -95,8 +92,8 @@ def import_model(llm: openllm.LLM[M, T],
# since saving int4 is not yet supported
if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False):
attrs.pop('quantization_config')
if llm.__llm_implementation__ != 'flax': attrs['use_safetensors'] = safe_serialisation
metadata['_framework'] = 'pt' if llm.__llm_implementation__ == 'vllm' else llm.__llm_implementation__
if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation
metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id,
trust_remote_code=trust_remote_code,
@@ -108,7 +105,7 @@ def import_model(llm: openllm.LLM[M, T],
imported_modules: list[types.ModuleType] = []
bentomodel = bentoml.Model.create(llm.tag,
module='openllm.serialisation.transformers',
api_version='v1',
api_version='v2',
options=ModelOptions(),
context=openllm.utils.generate_context(framework_name='openllm'),
labels=openllm.utils.generate_labels(llm),
@@ -133,8 +130,7 @@ def import_model(llm: openllm.LLM[M, T],
trust_remote_code=trust_remote_code,
use_safetensors=safe_serialisation,
**hub_attrs,
**attrs,
)
**attrs)
update_model(bentomodel,
metadata={
'_pretrained_class': model.__class__.__name__,
@@ -192,27 +188,21 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
'''
try:
model = bentoml.models.get(llm.tag)
if model.info.module not in ('openllm.serialisation.transformers'
'bentoml.transformers', 'bentoml._internal.frameworks.transformers',
__name__): # NOTE: backward compatible with previous version of OpenLLM.
raise bentoml.exceptions.NotFound(
f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
)
if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
if Version(model.info.api_version) < Version('v2'):
raise openllm.exceptions.OpenLLMException(
f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
'Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
if model.info.labels['backend'] != llm.__llm_backend__:
raise openllm.exceptions.OpenLLMException(
f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}."
)
return model
except bentoml.exceptions.NotFound as err:
if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
raise err from None
except Exception as err:
if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
raise openllm.exceptions.OpenLLMException(
f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
'''Load the model from BentoML store.
By default, it will look for the model in the local store.
If the model is not found, it raises ``bentoml.exceptions.NotFound``.
'''
config, hub_attrs, attrs = process_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs)
safe_serialization = openllm.utils.first_not_none(t.cast(
t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
attrs.pop('safe_serialization', None),
@@ -229,7 +219,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
*decls,
quantize_config=t.cast('autogptq.BaseQuantizeConfig',
llm.quantization_config),
trust_remote_code=llm.__llm_trust_remote_code__,
trust_remote_code=llm.trust_remote_code,
use_safetensors=safe_serialization,
**hub_attrs,
**attrs)
@@ -238,57 +228,9 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path,
*decls,
config=config,
trust_remote_code=llm.__llm_trust_remote_code__,
trust_remote_code=llm.trust_remote_code,
device_map=device_map,
**hub_attrs,
**attrs).eval()
# BetterTransformer is currently only supported on PyTorch.
if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
if llm.__llm_implementation__ in {'pt', 'vllm'}: check_unintialised_params(model)
if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
return t.cast('M', model)
# Persist the LLM's model (and its tokenizer) to ``save_directory``.
#
# NOTE(review): this block is shown with its leading indentation stripped, so the
# nesting described in the comments below is inferred from statement order only -
# confirm against the original module before relying on it.
#
# is_main_process, state_dict, save_function, push_to_hub, max_shard_size,
# safe_serialization and variant are forwarded to the model's own
# ``save_pretrained`` in the final fallback branch; ``push_to_hub`` is also passed
# to the tokenizer. Any extra ``**attrs`` are split into model and tokenizer kwargs.
#
# Raises OpenLLMException / ValueError when the GPTQ preconditions are not met, and
# RuntimeError when the underlying model is a ``vllm.LLMEngine``.
def save_pretrained(llm: openllm.LLM[M, T],
save_directory: str,
is_main_process: bool = True,
state_dict: DictStrAny | None = None,
save_function: t.Any | None = None,
push_to_hub: bool = False,
max_shard_size: int | str = '10GB',
safe_serialization: bool = False,
variant: str | None = None,
**attrs: t.Any) -> None:
# Fall back to ``torch.save`` when the caller does not supply a save function.
save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save))
# Split the extra keyword arguments into (model, tokenizer) save kwargs.
model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs)
# The LLM's configured serialisation format can switch safetensors on even when the argument is False.
safe_serialization = safe_serialization or llm._serialisation_format == 'safetensors'
# NOTE: disable safetensors for vllm
if llm.__llm_implementation__ == 'vllm': safe_serialization = False
# GPTQ-quantised models: check that auto-gptq is installed, that the config is a causal LM and
# that the model is a BaseGPTQForCausalLM, then save through ``save_quantized``.
if llm._quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
)
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM):
raise ValueError(f'Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})')
t.cast('autogptq.modeling.BaseGPTQForCausalLM', llm.model).save_quantized(save_directory,
use_safetensors=safe_serialization)
# A vLLM engine instance is rejected with a RuntimeError instead of being saved.
elif openllm.utils.LazyType['vllm.LLMEngine']('vllm.LLMEngine').isinstance(llm.model):
raise RuntimeError(
"vllm.LLMEngine cannot be serialisation directly. This happens when 'save_pretrained' is called directly after `openllm.AutoVLLM` is initialized."
)
# transformers pipelines only receive the destination and the safe_serialization flag.
elif isinstance(llm.model, transformers.Pipeline):
llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
else:
# We can safely cast here since it will be the PreTrainedModel protocol.
t.cast('transformers.PreTrainedModel', llm.model).save_pretrained(save_directory,
is_main_process=is_main_process,
state_dict=state_dict,
save_function=save_function,
push_to_hub=push_to_hub,
max_shard_size=max_shard_size,
safe_serialization=safe_serialization,
variant=variant,
**model_save_attrs)
# Save the tokenizer into the same directory as the model.
# NOTE(review): whether this runs after every branch or only in the fallback branch cannot be
# determined from the flattened text - confirm the original indentation.
llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)

View File

@@ -76,7 +76,7 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra
if type(config) in transformers.MODEL_FOR_CAUSAL_LM_MAPPING: idx = 0
elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: idx = 1
else: raise openllm.exceptions.OpenLLMException(f'Model type {type(config)} is not supported yet.')
return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_implementation__][idx])
return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_backend__][idx])
def check_unintialised_params(model: torch.nn.Module) -> None:
unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')]
@@ -104,11 +104,11 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod
def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
infer_fn: tuple[str, ...] = ('__call__',)
default_config = ModelSignature(batchable=False)
if llm.__llm_implementation__ in {'pt', 'vllm'}:
if llm.__llm_backend__ in {'pt', 'vllm'}:
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
'group_beam_search', 'constrained_beam_search',
)
elif llm.__llm_implementation__ == 'tf':
elif llm.__llm_backend__ == 'tf':
infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search',
'contrastive_search',
)

View File

@@ -23,9 +23,9 @@ class HfIgnore:
@classmethod
def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
if llm.__llm_implementation__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
elif llm.__llm_implementation__ == 'tf': base = [cls.flax, cls.pt]
elif llm.__llm_implementation__ == 'flax':
if llm.__llm_backend__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
elif llm.__llm_backend__ == 'tf': base = [cls.flax, cls.pt]
elif llm.__llm_backend__ == 'flax':
base = [cls.tf, cls.pt, cls.safetensors] # as of current, safetensors is not supported with flax
else:
base = [cls.tf, cls.flax]

View File

@@ -10,7 +10,7 @@ import bentoml
import openllm
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralBackend
logger = logging.getLogger(__name__)
@@ -18,10 +18,9 @@ logger = logging.getLogger(__name__)
def build_bento(model: str,
model_id: str | None = None,
quantize: t.Literal['int4', 'int8', 'gptq'] | None = None,
runtime: t.Literal['ggml', 'transformers'] = 'transformers',
cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
logger.info('Building BentoML for %s', model)
bento = openllm.build(model, model_id=model_id, quantize=quantize, runtime=runtime)
bento = openllm.build(model, model_id=model_id, quantize=quantize)
yield bento
if cleanup:
logger.info('Deleting %s', bento.tag)
@@ -49,7 +48,7 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag,
@contextlib.contextmanager
def prepare(model: str,
model_id: str | None = None,
implementation: LiteralRuntime = 'pt',
implementation: LiteralBackend = 'pt',
deployment_mode: t.Literal['container', 'local'] = 'local',
clean_context: contextlib.ExitStack | None = None,
cleanup: bool = True) -> t.Iterator[str]:

View File

@@ -16,11 +16,11 @@ from . import dummy_vllm_objects as dummy_vllm_objects
if t.TYPE_CHECKING:
import openllm
from openllm_core._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralBackend
def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
return {
'runtime': llm.runtime,
'backend': llm.__llm_backend__,
'framework': 'openllm',
'model_name': llm.config['model_name'],
'architecture': llm.config['architecture'],
@@ -28,14 +28,13 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
}
def infer_auto_class(
implementation: LiteralRuntime
) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
import openllm
if implementation == 'tf': return openllm.AutoTFLLM
elif implementation == 'flax': return openllm.AutoFlaxLLM
elif implementation == 'pt': return openllm.AutoLLM
elif implementation == 'vllm': return openllm.AutoVLLM
else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')")
if backend == 'tf': return openllm.AutoTFLLM
elif backend == 'flax': return openllm.AutoFlaxLLM
elif backend == 'pt': return openllm.AutoLLM
elif backend == 'vllm': return openllm.AutoVLLM
else: raise RuntimeError(f"Unknown backend: {backend} (supported: 'pt', 'flax', 'tf', 'vllm')")
__all__ = [
'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects',

View File

@@ -30,12 +30,10 @@ def model_settings(draw: st.DrawFn):
st.booleans(),
'requirements':
st.none() | st.lists(st.text(), min_size=1),
'default_implementation':
'default_backend':
st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
'model_type':
st.sampled_from(['causal_lm', 'seq2seq_lm']),
'runtime':
st.sampled_from(['transformers', 'ggml']),
'name_type':
st.sampled_from(['dasherize', 'lowercase']),
'timeout':

View File

@@ -111,10 +111,7 @@ def patch_env(**attrs: t.Any):
yield
def test_struct_envvar():
with patch_env(**{
field_env_key('env_llm', 'field1'): '4',
field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',
}):
with patch_env(**{field_env_key('field1'): '4', field_env_key('temperature', suffix='generation'): '0.2',}):
class EnvLLM(openllm.LLMConfig):
__config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
@@ -146,8 +143,8 @@ def test_struct_provided_fields():
def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as mk:
mk.setenv(field_env_key('overwrite_with_env_available', 'field1'), str(4.0))
mk.setenv(field_env_key('overwrite_with_env_available', 'temperature', suffix='generation'), str(0.2))
mk.setenv(field_env_key('field1'), str(4.0))
mk.setenv(field_env_key('temperature', suffix='generation'), str(0.2))
sent = make_llm_config('OverwriteWithEnvAvailable', {
'default_id': 'asdfasdf',
'model_ids': ['asdf', 'asdfasdfads'],

View File

@@ -8,9 +8,9 @@ import pytest
import openllm
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralBackend
_FRAMEWORK_MAPPING = {
_MODELING_MAPPING = {
'flan_t5': 'google/flan-t5-small',
'opt': 'facebook/opt-125m',
'baichuan': 'baichuan-inc/Baichuan-7B',
@@ -22,19 +22,17 @@ _PROMPT_MAPPING = {
def parametrise_local_llm(
model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
if model not in _FRAMEWORK_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
runtime_impl: tuple[LiteralRuntime, ...] = tuple()
if model in openllm.MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
if model in openllm.MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
if model in openllm.MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()):
llm = openllm.Runner(model,
model_id=_FRAMEWORK_MAPPING[model],
ensure_available=True,
implementation=framework,
init_local=True,
)
yield prompt, llm
if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
backends: tuple[LiteralBackend, ...] = tuple()
if model in openllm.MODEL_MAPPING_NAMES: backends += ('pt',)
if model in openllm.MODEL_FLAX_MAPPING_NAMES: backends += ('flax',)
if model in openllm.MODEL_TF_MAPPING_NAMES: backends += ('tf',)
for backend, prompt in itertools.product(backends, _PROMPT_MAPPING.keys()):
yield prompt, openllm.Runner(model,
model_id=_MODELING_MAPPING[model],
ensure_available=True,
backend=backend,
init_local=True)
def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
if os.getenv('GITHUB_ACTIONS') is None:

View File

@@ -4,6 +4,7 @@ import os
import typing as t
import pytest
import transformers
import openllm
@@ -28,7 +29,7 @@ def test_general_build_with_internal_testing():
bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
assert llm.llm_type == bento.info.labels['_type']
assert llm.config['env']['framework_value'] == bento.info.labels['_framework']
assert llm.config['env']['backend_value'] == bento.info.labels['_framework']
bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
assert len(bento_store.list(bento.tag)) == 1
@@ -38,10 +39,11 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
local_path = tmp_path_factory.mktemp('local_t5')
llm = openllm.AutoLLM.for_model('flan-t5', model_id=HF_INTERNAL_T5_TESTING, ensure_available=True)
if llm.bettertransformer:
llm.__llm_model__ = llm.model.reverse_bettertransformer()
llm.save_pretrained(local_path)
if isinstance(llm.model, transformers.Pipeline):
llm.model.save_pretrained(str(local_path))
else:
llm.model.save_pretrained(str(local_path))
llm.tokenizer.save_pretrained(str(local_path))
assert openllm.build('flan-t5', model_id=local_path.resolve().__fspath__(), model_version='local')