feat: 1.2 APIs (#821)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2024-03-15 03:49:19 -04:00
committed by GitHub
parent e3392476be
commit 072b3e97ec
116 changed files with 4451 additions and 6144 deletions

View File

@@ -1,5 +1,5 @@
import logging as _logging, os as _os, pathlib as _pathlib, warnings as _warnings
from openllm_cli import _sdk
import logging as _logging, os as _os, pathlib as _pathlib, warnings as _warnings, typing as _t
from . import utils as utils
if utils.DEBUG:
@@ -9,10 +9,16 @@ else:
# configuration for bitsandbytes before import
_os.environ['BITSANDBYTES_NOWELCOME'] = _os.environ.get('BITSANDBYTES_NOWELCOME', '1')
# NOTE: The following warnings come from bitsandbytes and are probably not important for users to see when DEBUG is False
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
_warnings.filterwarnings(
'ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization'
)
_warnings.filterwarnings(
'ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization'
)
_warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
_warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated')
_warnings.filterwarnings(
'ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated'
)
COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so')
__lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__name__,
@@ -22,21 +28,42 @@ __lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once my
'client': ['HTTPClient', 'AsyncHTTPClient'],
'bundle': [],
'testing': [],
'protocol': [],
'utils': [],
'_deprecated': ['Runner'],
'_strategies': ['CascadingResourceStrategy', 'get_resource'],
'utils': ['api'],
'entrypoints': ['mount_entrypoints'],
'serialisation': ['ggml', 'transformers'],
'_quantisation': ['infer_quantisation_config'],
'serialisation': ['ggml', 'transformers', 'vllm'],
'_llm': ['LLM'],
'_deprecated': ['Runner'],
'_runners': ['runner'],
'_quantisation': ['infer_quantisation_config'],
'_strategies': ['CascadingResourceStrategy', 'get_resource'],
},
extra_objects={
'COMPILED': COMPILED,
'start': _sdk.start,
'build': _sdk.build, #
'import_model': _sdk.import_model,
'list_models': _sdk.list_models, #
},
extra_objects={'COMPILED': COMPILED},
)
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__
__all__, __dir__ = __lazy.__all__, __lazy.__dir__
_BREAKING_INTERNAL = ['_service', '_service_vars']
_NEW_IMPL = ['LLM', *_BREAKING_INTERNAL]
if (_BENTOML_VERSION := utils.pkg.pkg_version_info('bentoml')) > (1, 2):
import _openllm_tiny as _tiny
else:
_tiny = None
def __getattr__(name: str) -> _t.Any:
  """Resolve a module attribute lazily, routing the new-implementation names.

  Names outside ``_NEW_IMPL`` are delegated to the lazy module. Names inside it
  are served from ``_tiny`` unless the ``IMPLEMENTATION`` value read through
  ``utils.getenv`` equals ``'deprecated'`` or ``_tiny`` is ``None``. In that
  fallback case the names listed in ``_BREAKING_INTERNAL`` raise ``ImportError``
  and every other name emits a ``DeprecationWarning`` before being looked up on
  the lazy module.
  """
  # Guard clause: anything that is not part of the new implementation goes straight to the lazy module.
  if name not in _NEW_IMPL:
    return __lazy.__getattr__(name)

  legacy_requested = utils.getenv('IMPLEMENTATION', default='new_impl') == 'deprecated'
  if not (legacy_requested or _tiny is None):
    return getattr(_tiny, name)

  # From here on we are serving the deprecated implementation.
  if name in _BREAKING_INTERNAL:
    raise ImportError(
      f'"{name}" is an internal implementation and considered breaking with older OpenLLM. Please migrate your code if you depend on this.'
    )
  _warnings.warn(
    f'"{name}" is considered deprecated implementation and will be removed in the future. Make sure to upgrade to OpenLLM 0.5.x',
    DeprecationWarning,
    stacklevel=3,
  )
  return __lazy.__getattr__(name)

View File

@@ -10,22 +10,29 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
* Native integration with BentoML, LangChain, OpenAI compatible endpoints, LlamaIndex for custom LLM apps
"""
# fmt: off
# update-config-stubs.py: import stubs start
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
from openllm_client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GemmaConfig as GemmaConfig, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig
from openllm_core._schemas import GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput, MessageParam as MessageParam
from openllm_core.utils import api as api
# update-config-stubs.py: import stubs stop
# fmt: on
from openllm_cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start
from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams
from openllm_core._schemas import GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput
from . import bundle as bundle, client as client, exceptions as exceptions, serialisation as serialisation, utils as utils
from . import (
bundle as bundle,
client as client,
exceptions as exceptions,
serialisation as serialisation,
utils as utils,
entrypoints as entrypoints,
)
from .serialisation import ggml as ggml, transformers as transformers, vllm as vllm
from ._deprecated import Runner as Runner
from ._llm import LLM as LLM
from ._runners import runner as runner
from ._quantisation import infer_quantisation_config as infer_quantisation_config
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
from .client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient
from .entrypoints import mount_entrypoints as mount_entrypoints
from .protocol import openai as openai
from .serialisation import ggml as ggml, transformers as transformers
from _openllm_tiny import LLM as LLM
COMPILED: bool = ...

View File

@@ -19,7 +19,9 @@ def Runner(
if llm_config is None:
llm_config = openllm.AutoConfig.for_model(model_name)
if not ensure_available:
logger.warning("'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation.")
logger.warning(
"'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation."
)
model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id']))
warnings.warn(
f"""\
@@ -40,8 +42,14 @@ def Runner(
attrs.update({
'model_id': model_id,
'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), #
'serialisation': getenv('serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']),
'serialisation': getenv(
'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']
),
})
# XXX: Make this back to Runnable implementation
return openllm.LLM(
backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), llm_config=llm_config, embedded=init_local, **attrs
backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'),
llm_config=llm_config,
embedded=init_local,
**attrs,
).runner

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
import functools, logging, os, warnings, typing as t
import attr, inflection, orjson, bentoml, openllm
import attr, orjson, bentoml, openllm, openllm_core
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
@@ -10,22 +10,20 @@ from openllm_core._typing_compat import (
LiteralDtype,
LiteralQuantise,
LiteralSerialisation,
M,
T,
)
from openllm.serialisation import _make_tag_components
from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import (
DEBUG,
apply,
check_bool_env,
codegen,
first_not_none,
normalise_model_name,
flatten_attrs,
gen_random_uuid,
generate_hash_from_file,
getenv,
is_ctranslate_available,
is_peft_available,
is_transformers_available,
is_vllm_available,
resolve_filepath,
validate_is_path,
@@ -43,29 +41,50 @@ if t.TYPE_CHECKING:
logger = logging.getLogger(__name__)
_AdapterTuple: type[AdapterTuple] = codegen.make_attr_tuple_class('AdapterTuple', ['adapter_id', 'name', 'config'])
ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]
CONFIG_FILE_NAME = 'config.json'
M = t.TypeVar('M')
T = t.TypeVar('T')
@attr.define(slots=False, repr=False, init=False)
class LLM(t.Generic[M, T]):
async def generate(
  self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
  """Run a full generation and return a single aggregated result.

  Consumes ``self.generate_iterator`` to exhaustion, concatenating the text and
  token ids of every streamed output by its ``index``, then returns a copy of the
  last streamed result carrying the original prompt and the merged outputs.

  Args:
    prompt: Prompt forwarded to ``generate_iterator`` and stored on the result.
    prompt_token_ids: Forwarded to ``generate_iterator`` as a keyword argument.
    stop: When not ``None``, folded into ``attrs`` under the ``'stop'`` key.
    stop_token_ids: When not ``None``, folded into ``attrs`` under ``'stop_token_ids'``.
    request_id: Forwarded to ``generate_iterator`` as a keyword argument.
    adapter_name: Only accepted when the backend is ``'pt'``.
    **attrs: Extra options passed to ``self.config.model_construct_env``.

  Returns:
    The last streamed result, copied with ``prompt`` and the merged ``outputs``.

  Raises:
    NotImplementedError: If ``adapter_name`` is given and the backend is not ``'pt'``.
    RuntimeError: If the iterator yields no result at all.
  """
  if adapter_name is not None and self.__llm_backend__ != 'pt':
    raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')

  if stop is not None:
    attrs.update({'stop': stop})
  if stop_token_ids is not None:
    attrs.update({'stop_token_ids': stop_token_ids})

  config = self.config.model_construct_env(**attrs)
  # One independent accumulator per output index. ``[[]] * n`` would alias a single
  # list n times and merge every output's text and tokens together.
  # NOTE(review): config['n'] is presumably the number of sequences per prompt -- confirm.
  texts = [[] for _ in range(config['n'])]
  token_ids = [[] for _ in range(config['n'])]

  # Track the last streamed result explicitly so an empty stream reaches the
  # RuntimeError below instead of failing on an unbound loop variable.
  final_result = None
  async for result in self.generate_iterator(
    prompt,
    prompt_token_ids=prompt_token_ids,
    request_id=request_id,
    adapter_name=adapter_name,
    **config.model_dump(),
  ):
    for output in result.outputs:
      texts[output.index].append(output.text)
      token_ids[output.index].extend(output.token_ids)
    final_result = result

  if final_result is None:
    raise RuntimeError('No result is returned.')

  return final_result.model_copy(
    update=dict(
      prompt=prompt,
      outputs=[
        output.model_copy(update=dict(text=''.join(texts[output.index]), token_ids=token_ids[output.index]))
        for output in final_result.outputs
      ],
    )
  )
async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
async def generate_iterator(
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
if adapter_name is not None and self.__llm_backend__ != 'pt':
@@ -76,6 +95,7 @@ class LLM(t.Generic[M, T]):
raise RuntimeError('Runner client failed to set up correctly.')
else:
self.runner.init_local(quiet=True)
config = self.config.model_construct_env(**attrs)
stop_token_ids = stop_token_ids or []
@@ -93,36 +113,32 @@ class LLM(t.Generic[M, T]):
stop = {stop}
else:
stop = set(stop)
for tid in stop_token_ids:
if tid:
stop.add(self.tokenizer.decode(tid))
if prompt_token_ids is None:
if prompt is None:
raise ValueError('Either prompt or prompt_token_ids must be specified.')
prompt_token_ids = self.tokenizer.encode(prompt)
request_id = gen_random_uuid() if request_id is None else request_id
previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n']
config = config.model_construct_env(stop=list(stop), stop_token_ids=stop_token_ids)
try:
generator = self.runner.generate_iterator.async_stream(
prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True)
generator = bentoml.io.SSE.from_iterator(
self.runner.generate_iterator.async_stream(
prompt, request_id, prompt_token_ids=prompt_token_ids, adapter_name=adapter_name, **config.model_dump()
)
)
generator = bentoml.io.SSE.from_iterator(generator)
except Exception as err:
raise RuntimeError(f'Failed to start generation task: {err}') from err
try:
async for out in generator:
out = out.data
generated = GenerationOutput.from_runner(out).with_options(prompt=prompt)
generated = GenerationOutput.from_runner(out).model_copy(update=dict(prompt=prompt))
delta_outputs = [None] * len(generated.outputs)
for output in generated.outputs:
i = output.index
delta_tokens, delta_text = output.token_ids[previous_num_tokens[i] :], output.text[len(previous_texts[i]) :]
previous_texts[i], previous_num_tokens[i] = output.text, len(output.token_ids)
delta_outputs[i] = output.with_options(text=delta_text, token_ids=delta_tokens)
yield generated.with_options(outputs=delta_outputs)
delta_outputs[i] = output.model_copy(update=dict(text=delta_text, token_ids=delta_tokens))
yield generated.model_copy(update=dict(outputs=delta_outputs))
except Exception as err:
raise RuntimeError(f'Exception caught during generation: {err}') from err
@@ -130,7 +146,9 @@ class LLM(t.Generic[M, T]):
# The below are mainly for internal implementation that you don't have to worry about.
_model_id: str
_revision: t.Optional[str] #
_quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]]
_quantization_config: t.Optional[
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
]
_quantise: t.Optional[LiteralQuantise]
_model_decls: t.Tuple[t.Any, ...]
__model_attrs: t.Dict[str, t.Any] #
@@ -146,7 +164,9 @@ class LLM(t.Generic[M, T]):
__llm_torch_dtype__: 'torch.dtype' = None
__llm_config__: t.Optional[LLMConfig] = None
__llm_backend__: LiteralBackend = None
__llm_quantization_config__: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] = None
__llm_quantization_config__: t.Optional[
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
] = None
__llm_runner__: t.Optional[Runner[M, T]] = None
__llm_model__: t.Optional[M] = None
__llm_tokenizer__: t.Optional[T] = None
@@ -177,7 +197,9 @@ class LLM(t.Generic[M, T]):
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
if torch_dtype is not None:
warnings.warn(
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
DeprecationWarning,
stacklevel=3,
)
dtype = torch_dtype
_local = False
@@ -195,7 +217,7 @@ class LLM(t.Generic[M, T]):
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
model_attrs, tokenizer_attrs = flatten_attrs(**attrs)
if model_tag is None:
model_tag, model_version = self._make_tag_components(model_id, model_version, backend=backend)
model_tag, model_version = _make_tag_components(model_id, model_version)
if model_version:
model_tag = f'{model_tag}:{model_version}'
@@ -233,25 +255,12 @@ class LLM(t.Generic[M, T]):
class _Quantise:
@staticmethod
def pt(llm: LLM, quantise=None): return quantise
@staticmethod
def vllm(llm: LLM, quantise=None): return quantise
@staticmethod
def ctranslate(llm: LLM, quantise=None):
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}: raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
if quantise == 'int8': quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
def pt(llm: LLM, quantise=None):
return quantise
@apply(lambda val: tuple(str.lower(i) if i else i for i in val))
def _make_tag_components(self, model_id: str, model_version: str | None, backend: str) -> tuple[str, str | None]:
model_id, *maybe_revision = model_id.rsplit(':')
if len(maybe_revision) > 0:
if model_version is not None:
logger.warning("revision is specified (%s). 'model_version=%s' will be ignored.", maybe_revision[0], model_version)
model_version = maybe_revision[0]
if validate_is_path(model_id):
model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
return f'{backend}-{normalise_model_name(model_id)}', model_version
@staticmethod
def vllm(llm: LLM, quantise=None):
return quantise
@functools.cached_property
def _has_gpus(self):
@@ -259,9 +268,11 @@ class LLM(t.Generic[M, T]):
from cuda import cuda
err, *_ = cuda.cuInit(0)
if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.')
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Failed to initialise CUDA runtime binding.')
err, _ = cuda.cuDeviceGetCount()
if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to get CUDA device count.')
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Failed to get CUDA device count.')
return True
except (ImportError, RuntimeError):
return False
@@ -272,16 +283,13 @@ class LLM(t.Generic[M, T]):
_map = _torch_dtype_mapping()
if not isinstance(self.__llm_torch_dtype__, torch.dtype):
try:
hf_config = transformers.AutoConfig.from_pretrained(self.bentomodel.path, trust_remote_code=self.trust_remote_code)
except OpenLLMException:
hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
config_dtype = getattr(hf_config, 'torch_dtype', None)
if config_dtype is None:
config_dtype = torch.float32
hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
config_dtype = getattr(hf_config, 'torch_dtype', torch.float32)
if self.__llm_dtype__ == 'auto':
if config_dtype == torch.float32:
if torch.cuda.is_available() and config_dtype is torch.float32:
torch_dtype = torch.float16
elif not torch.cuda.is_available():
torch_dtype = torch.float32
else:
torch_dtype = config_dtype
else:
@@ -304,14 +312,11 @@ class LLM(t.Generic[M, T]):
return {**self.import_kwargs[1], **self.__tokenizer_attrs}
def _cascade_backend(self) -> LiteralBackend:
logger.warning('It is recommended to specify the backend explicitly. Cascading backend might lead to unexpected behaviour.')
if self._has_gpus:
if is_vllm_available():
return 'vllm'
elif is_ctranslate_available():
return 'ctranslate'
elif is_ctranslate_available():
return 'ctranslate'
logger.warning(
'It is recommended to specify the backend explicitly. Cascading backend might lead to unexpected behaviour.'
)
if self._has_gpus and is_vllm_available():
return 'vllm'
else:
return 'pt'
@@ -339,7 +344,10 @@ class LLM(t.Generic[M, T]):
@property
def import_kwargs(self):
return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {'padding_side': 'left', 'truncation_side': 'left'}
return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {
'padding_side': 'left',
'truncation_side': 'left',
}
@property
def trust_remote_code(self):
@@ -362,7 +370,7 @@ class LLM(t.Generic[M, T]):
@property
def bentomodel(self):
return openllm.serialisation.get(self)
return bentoml.models.get(self.tag)
@property
def quantization_config(self):
@@ -372,7 +380,9 @@ class LLM(t.Generic[M, T]):
if self._quantization_config is not None:
self.__llm_quantization_config__ = self._quantization_config
elif self._quantise is not None:
self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self, self._quantise, **self._model_attrs)
self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(
self, self._quantise, **self._model_attrs
)
else:
raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
return self.__llm_quantization_config__
@@ -400,7 +410,7 @@ class LLM(t.Generic[M, T]):
@property
def identifying_params(self):
return {
'configuration': self.config.model_dump_json().decode(),
'configuration': self.config.model_dump_json(),
'model_ids': orjson.dumps(self.config['model_ids']).decode(),
'model_id': self.model_id,
}
@@ -427,7 +437,11 @@ class LLM(t.Generic[M, T]):
model = get_peft_model(
prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checking),
self.config['fine_tune_strategies'].get(adapter_type, self.config.make_fine_tune_config(adapter_type)).train().with_config(**attrs).build(),
self.config['fine_tune_strategies']
.get(adapter_type, self.config.make_fine_tune_config(adapter_type))
.train()
.with_config(**attrs)
.build(),
)
if DEBUG:
model.print_trainable_parameters()
@@ -447,7 +461,10 @@ class LLM(t.Generic[M, T]):
if self.__llm_adapter_map__ is None:
_map: ResolvedAdapterMap = {k: {} for k in self._adapter_map}
for adapter_type, adapter_tuple in self._adapter_map.items():
base = first_not_none(self.config['fine_tune_strategies'].get(adapter_type), default=self.config.make_fine_tune_config(adapter_type))
base = first_not_none(
self.config['fine_tune_strategies'].get(adapter_type),
default=self.config.make_fine_tune_config(adapter_type),
)
for adapter in adapter_tuple:
_map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id)
self.__llm_adapter_map__ = _map
@@ -456,50 +473,44 @@ class LLM(t.Generic[M, T]):
@property
def model(self):
if self.__llm_model__ is None:
model = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
if self.__llm_backend__ == 'pt':
import torch
loaded_in_kbit = (
getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
model = model.to('cuda')
except Exception as err:
raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err
if self.has_adapters:
logger.debug('Applying the following adapters: %s', self.adapter_map)
for adapter_dict in self.adapter_map.values():
for adapter_name, (peft_config, peft_model_id) in adapter_dict.items():
model.load_adapter(peft_model_id, adapter_name, peft_config=peft_config)
self.__llm_model__ = model
self.__llm_model__ = openllm.serialisation.load_model(self, *self._model_decls, **self._model_attrs)
return self.__llm_model__
@property
def config(self):
import transformers
if self.__llm_config__ is None:
if self.__llm_backend__ == 'ctranslate':
try:
config = transformers.AutoConfig.from_pretrained(self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code)
except OpenLLMException:
config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
for architecture in config.architectures:
if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
config = openllm.AutoConfig.infer_class_from_name(
openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]
).model_construct_env(**self._model_attrs)
break
else:
raise OpenLLMException(
f"Failed to infer the configuration class. Make sure the model is a supported model. Supported models are: {', '.join(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE.keys())}"
)
if self._local:
config_file = os.path.join(self.model_id, CONFIG_FILE_NAME)
else:
config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs)
self.__llm_config__ = config
try:
config_file = self.bentomodel.path_of(CONFIG_FILE_NAME)
except OpenLLMException as err:
if not is_transformers_available():
raise MissingDependencyError(
"Requires 'transformers' to be available. Do 'pip install transformers'"
) from err
from transformers.utils import cached_file
try:
config_file = cached_file(self.model_id, CONFIG_FILE_NAME)
except Exception as err:
raise ValueError(
"Failed to determine architecture from 'config.json'. If this is a gated model, make sure to pass in HUGGING_FACE_HUB_TOKEN"
) from err
if not os.path.exists(config_file):
raise ValueError(f"Failed to find 'config.json' (config_json_path={config_file})")
with open(config_file, 'r', encoding='utf-8') as f:
loaded_config = orjson.loads(f.read())
if 'architectures' in loaded_config:
for architecture in loaded_config['architectures']:
if architecture in self._architecture_mappings:
self.__llm_config__ = openllm_core.AutoConfig.for_model(
self._architecture_mappings[architecture]
).model_construct_env()
break
else:
raise ValueError(f"Failed to find architecture from 'config.json' (config_json_path={config_file})")
return self.__llm_config__
@@ -516,13 +527,11 @@ def _torch_dtype_mapping() -> dict[str, torch.dtype]:
}
def normalise_model_name(name: str) -> str:
return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/', '--'))
def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
if not is_peft_available():
raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'")
raise RuntimeError(
"LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'"
)
from huggingface_hub import hf_hub_download
resolved: AdapterMap = {}

View File

@@ -1,4 +1,4 @@
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypedDict, Union
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Optional, Tuple, TypedDict, Union, TypeVar
import attr
import torch
@@ -7,13 +7,26 @@ from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2Seq
from bentoml import Model, Tag
from openllm_core import LLMConfig
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import AdapterMap, AdapterType, LiteralBackend, LiteralDtype, LiteralQuantise, LiteralSerialisation, M, T
from openllm_core._schemas import GenerationOutput, GenerationInputDict, MetadataOutput
from openllm_core._typing_compat import (
AdapterMap,
AdapterType,
LiteralBackend,
LiteralQuantise,
LiteralSerialisation,
ParamSpec,
MessagesConverterInput,
)
from openllm_core.utils import api
from ._quantisation import QuantizationConfig
from ._runners import Runner
from _openllm_tiny._llm import Dtype
InjectedModel = Union[PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM]
P = ParamSpec('P')
M = TypeVar('M')
T = TypeVar('T')
class IdentifyingParams(TypedDict):
configuration: str
@@ -21,8 +34,16 @@ class IdentifyingParams(TypedDict):
model_id: str
ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]]
CTranslateDtype = Literal['int8_float32', 'int8_float16', 'int8_bfloat16']
Dtype = Union[LiteralDtype, CTranslateDtype, Literal['auto', 'half', 'float']]
# Type stub for the service class: declares four methods, each wrapped with the
# `api` decorator imported from `openllm_core.utils`. Bodies are elided (`...`).
class LLMService:
# Coroutine taking generation parameters and returning one GenerationOutput.
@api
async def generate_v1(self, parameters: GenerationInputDict = ...) -> GenerationOutput: ...
# Coroutine taking the same parameters but declared to return an async generator of strings.
@api
async def generate_stream_v1(self, parameters: GenerationInputDict = ...) -> AsyncGenerator[str, None]: ...
# Synchronous method returning a MetadataOutput; takes no arguments besides self.
@api
def metadata_v1(self) -> MetadataOutput: ...
# Synchronous method taking a MessagesConverterInput and returning a string.
@api
def helpers_messages_v1(self, message: MessagesConverterInput = ...) -> str: ...
@attr.define(slots=True, repr=False, init=False)
class LLM(Generic[M, T]):
@@ -37,6 +58,8 @@ class LLM(Generic[M, T]):
_adapter_map: Optional[AdapterMap]
_serialisation: LiteralSerialisation
_local: bool
_max_model_len: Optional[int]
_gpu_memory_utilization: float
__llm_dtype__: Dtype = ...
__llm_torch_dtype__: Optional[torch.dtype] = ...
@@ -49,7 +72,26 @@ class LLM(Generic[M, T]):
__llm_adapter_map__: Optional[ResolvedAdapterMap] = ...
__llm_trust_remote_code__: bool = ...
def __repr__(self) -> str: ...
# Type stub for LLM.generate: a coroutine that returns a single GenerationOutput.
# Only `prompt` has no default; every other named parameter is optional and
# additional keyword arguments are accepted through **attrs.
async def generate(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> GenerationOutput: ...
# Type stub for LLM.generate_iterator: same parameter list as `generate`, but the
# declared return type is an async generator that yields GenerationOutput items.
async def generate_iterator(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[GenerationOutput, None]: ...
def __init__(
self,
model_id: str,
@@ -66,6 +108,8 @@ class LLM(Generic[M, T]):
embedded: bool = ...,
dtype: Dtype = ...,
low_cpu_mem_usage: bool = ...,
max_model_len: Optional[int] = ...,
gpu_memory_utilization: float = ...,
**attrs: Any,
) -> None: ...
@property
@@ -91,8 +135,6 @@ class LLM(Generic[M, T]):
@property
def quantization_config(self) -> QuantizationConfig: ...
@property
def has_adapters(self) -> bool: ...
@property
def local(self) -> bool: ...
@property
def quantise(self) -> Optional[LiteralQuantise]: ...
@@ -112,24 +154,6 @@ class LLM(Generic[M, T]):
def runner(self) -> Runner[M, T]: ...
@property
def adapter_map(self) -> ResolvedAdapterMap: ...
def prepare(self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any) -> Tuple[InjectedModel, T]: ...
async def generate(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> GenerationOutput: ...
async def generate_iterator(
self,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
stop_token_ids: Optional[List[int]] = ...,
request_id: Optional[str] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[GenerationOutput, None]: ...
def prepare(
self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any
) -> Tuple[InjectedModel, T]: ...

View File

@@ -83,19 +83,25 @@ def infer_quantisation_config(llm, quantise, **attrs):
# NOTE: Quantization setup: 'quantize' is an openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization-aware training.
if not is_bitsandbytes_available():
raise RuntimeError('Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'')
raise RuntimeError(
'Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\''
)
if quantise == 'int8':
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available():
raise MissingDependencyError("GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'")
raise MissingDependencyError(
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
else:
quantisation_config = create_gptq_config()
elif quantise == 'awq':
if not is_autoawq_available():
raise MissingDependencyError("AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'.")
raise MissingDependencyError(
"AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'."
)
else:
quantisation_config = create_awq_config()
else:

View File

@@ -9,10 +9,18 @@ from ._llm import LLM
QuantizationConfig = Union[BitsAndBytesConfig, GPTQConfig, AwqConfig]
@overload
def infer_quantisation_config(self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ...
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any
) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any) -> tuple[GPTQConfig, Dict[str, Any]]: ...
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any
) -> tuple[GPTQConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(self: LLM[M, T], quantise: Literal['awq'], **attrs: Any) -> tuple[AwqConfig, Dict[str, Any]]: ...
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['awq'], **attrs: Any
) -> tuple[AwqConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any) -> tuple[QuantizationConfig, Dict[str, Any]]: ...
def infer_quantisation_config(
self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any
) -> tuple[QuantizationConfig, Dict[str, Any]]: ...

View File

@@ -1,8 +1,8 @@
from __future__ import annotations
import gc, traceback, types, typing as t
import gc, types, typing as t
import torch, bentoml, openllm
from openllm_core._schemas import CompletionChunk, GenerationOutput, SampleLogprobs
from openllm_core.utils import ReprMixin, is_ctranslate_available, is_vllm_available
from openllm_core.utils import ReprMixin, is_vllm_available
if t.TYPE_CHECKING:
from openllm_core._typing_compat import M, T
@@ -46,11 +46,14 @@ def runner(llm: openllm.LLM[M, T]) -> Runner[M, T]:
(
'runner_methods',
{
method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None}
method.name: {
'batchable': method.config.batchable,
'batch_dim': method.config.batch_dim if method.config.batchable else None,
}
for method in _.runner_methods
},
),
('config', llm.config.model_dump(flatten=True)),
('config', llm.config.model_dump()),
('llm_type', llm.llm_type),
('backend', llm.__llm_backend__),
('llm_tag', llm.tag),
@@ -68,49 +71,6 @@ def runner(llm: openllm.LLM[M, T]) -> Runner[M, T]:
)
@registry
class CTranslateRunnable(bentoml.Runnable):
    """Runnable that streams generations from a CTranslate2-backed model."""

    SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'cpu')
    SUPPORTS_CPU_MULTI_THREADING = True

    def __init__(self, llm):
        """Keep references to the LLM, its config, model and tokenizer.

        Raises:
            openllm.exceptions.OpenLLMException: if ctranslate is not installed.
        """
        if not is_ctranslate_available():
            raise openllm.exceptions.OpenLLMException('ctranslate is not installed. Do `pip install "openllm[ctranslate]"`')
        self.llm, self.config, self.model, self.tokenizer = llm, llm.config, llm.model, llm.tokenizer

    @bentoml.Runnable.method(batchable=False)
    async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
        """Yield one marshalled SSE event per generated token.

        Each event carries a JSON-dumped ``GenerationOutput`` whose single
        ``CompletionChunk`` holds the text decoded from *all* tokens generated
        so far (the prompt tokens are sliced off before decoding).

        Args:
            prompt_token_ids: token ids of the prompt.
            request_id: identifier echoed back in every ``GenerationOutput``.
            stop: ``None``, a single stop string, or an iterable of stop strings.
            adapter_name: accepted for signature parity; not used by this runnable.
            **attrs: forwarded to ``self.config.model_construct_env``.
        """
        # ``stop`` defaults to None, and ``list(None)`` raises TypeError, while ``list('abc')``
        # would split a single stop string into characters. Normalise to a list of strings first.
        # NOTE(review): None is mapped to an empty list so the callee always receives a list,
        # as the original ``list(stop)`` implied -- confirm this is what model_construct_env expects.
        if stop is None:
            stop_sequences = []
        elif isinstance(stop, str):
            stop_sequences = [stop]
        else:
            stop_sequences = list(stop)

        config, sampling_params = self.config.model_construct_env(stop=stop_sequences, **attrs).inference_options(self.llm)
        cumulative_logprob, output_token_ids, input_len = 0.0, list(prompt_token_ids), len(prompt_token_ids)
        tokens = self.tokenizer.convert_ids_to_tokens(prompt_token_ids)

        async for request_output in self.model.async_generate_tokens(tokens, **sampling_params):
            if config['logprobs']:
                cumulative_logprob += request_output.log_prob
            output_token_ids.append(request_output.token_id)
            # Decode everything generated so far; the prompt portion is excluded via input_len.
            text = self.tokenizer.decode(
                output_token_ids[input_len:],
                skip_special_tokens=True,
                spaces_between_special_tokens=False,
                clean_up_tokenization_spaces=True,
            )
            out = GenerationOutput(
                prompt_token_ids=prompt_token_ids,
                prompt='',
                finished=request_output.is_last,
                request_id=request_id,
                outputs=[
                    CompletionChunk(
                        index=0,
                        text=text,
                        finish_reason=None,
                        token_ids=output_token_ids[input_len:],
                        cumulative_logprob=cumulative_logprob,
                        # TODO: logprobs, but seems like we don't have access to the raw logits
                    )
                ],
            ).model_dump_json()
            yield bentoml.io.SSE(out).marshal()
@registry
class vLLMRunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
@@ -119,41 +79,17 @@ class vLLMRunnable(bentoml.Runnable):
def __init__(self, llm):
if not is_vllm_available():
raise openllm.exceptions.OpenLLMException('vLLM is not installed. Do `pip install "openllm[vllm]"`.')
import vllm
self.llm, self.config, self.tokenizer = llm, llm.config, llm.tokenizer
num_gpus, dev = 1, openllm.utils.device_count()
if dev >= 2:
num_gpus = min(dev // 2 * 2, dev)
quantise = llm.quantise if llm.quantise and llm.quantise in {'gptq', 'awq', 'squeezellm'} else None
dtype = torch.float16 if quantise == 'gptq' else llm._torch_dtype # NOTE: quantise GPTQ doesn't support bfloat16 yet.
try:
self.model = vllm.AsyncLLMEngine.from_engine_args(
vllm.AsyncEngineArgs(
worker_use_ray=False,
engine_use_ray=False, #
tokenizer_mode='auto',
tensor_parallel_size=num_gpus, #
model=llm.bentomodel.path,
tokenizer=llm.bentomodel.path, #
trust_remote_code=llm.trust_remote_code,
dtype=dtype, #
max_model_len=llm._max_model_len,
gpu_memory_utilization=llm._gpu_memory_utilization, #
quantization=quantise,
)
)
except Exception as err:
traceback.print_exc()
raise openllm.exceptions.OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
self.llm, self.config, self.model = llm, llm.config, llm.model
@bentoml.Runnable.method(batchable=False)
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
async def generate_iterator(self, prompt, request_id, prompt_token_ids=None, stop=None, adapter_name=None, **attrs):
_, sampling_params = self.config.model_construct_env(stop=stop, **attrs).inference_options(self.llm)
async for request_output in self.model.generate(None, sampling_params, request_id, prompt_token_ids):
async for request_output in self.model.generate(
prompt, sampling_params=sampling_params, request_id=request_id, prompt_token_ids=prompt_token_ids
):
out = GenerationOutput.from_vllm(request_output).model_dump_json()
out = bentoml.io.SSE(out).marshal()
yield out
yield bentoml.io.SSE(out).marshal()
@registry(alias='pt')
@@ -170,12 +106,17 @@ class PyTorchRunnable(bentoml.Runnable):
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@bentoml.Runnable.method(batchable=False)
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
async def generate_iterator(self, prompt, request_id, prompt_token_ids=None, stop=None, adapter_name=None, **attrs):
from ._generation import get_context_length, prepare_logits_processor
if adapter_name is not None:
self.model.set_adapter(adapter_name)
if prompt_token_ids is None:
if prompt is None:
raise ValueError('Either prompt or prompt_token_ids must be specified.')
prompt_token_ids = self.tokenizer.encode(prompt)
max_new_tokens = attrs.pop('max_new_tokens', 256)
context_length = attrs.pop('context_length', None)
if context_length is None:
@@ -202,7 +143,9 @@ class PyTorchRunnable(bentoml.Runnable):
if config['logprobs']: # FIXME: logprobs is not supported
raise NotImplementedError('Logprobs is yet to be supported with encoder-decoder models.')
encoder_output = self.model.encoder(input_ids=torch.as_tensor([prompt_token_ids], device=self.device))[0]
start_ids = torch.as_tensor([[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device)
start_ids = torch.as_tensor(
[[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device
)
else:
start_ids = torch.as_tensor([prompt_token_ids], device=self.device)
@@ -230,7 +173,9 @@ class PyTorchRunnable(bentoml.Runnable):
)
logits = self.model.lm_head(out[0])
else:
out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True)
out = self.model(
input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True
)
logits = out.logits
past_key_values = out.past_key_values
if logits_processor:
@@ -274,7 +219,12 @@ class PyTorchRunnable(bentoml.Runnable):
tmp_output_ids, rfind_start = output_token_ids[input_len:], 0
# XXX: Move this to API server
text = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
text = self.tokenizer.decode(
tmp_output_ids,
skip_special_tokens=True,
spaces_between_special_tokens=False,
clean_up_tokenization_spaces=True,
)
if len(stop) > 0:
for it in stop:

View File

@@ -1,40 +1,67 @@
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Protocol, Tuple, Type, TypeVar, Union, final
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer
from bentoml import Model, Strategy, Tag
from typing import (
Any,
TypeVar,
Protocol,
AsyncGenerator,
List,
Optional,
Iterable,
Dict,
Union,
Tuple,
Generic,
Type,
Literal,
)
import bentoml
from bentoml._internal.runner.runner_handle import RunnerHandle
from openllm_core import LLMConfig
from openllm_core._typing_compat import LiteralBackend, M, T
from openllm_core._typing_compat import M, T, LiteralBackend
from ._llm import LLM
try:
from vllm import AsyncLLMEngine
except ImportError:
AsyncLLMEngine = Any
try:
from ctranslate2 import Generator, Translator
except ImportError:
Translator = Generator = Any
Mo = TypeVar('Mo')
To = TypeVar('To')
__all_ = ['Runner', 'runner']
def runner(llm: LLM[M, T]) -> Runner[M, T]: ...
# class Runner(Protocol[Mo, To]):
# __doc__: str = ...
# __module__: str = ...
# llm: LLM[Mo, To] = ...
# llm_config: LLMConfig = ...
# llm_type: str = ...
# llm_tag: bentoml.Tag = ...
# llm_bentomodel: bentoml.Model = ...
# identifying_params: Dict[str, Any] = ...
# backend: LiteralBackend = ...
# template: str = ...
# system_message: str = ...
#
# @api # type: ignore[arg-type] # XXX: I don't really know how to fix this for marking positional-only arg as self?
# async def generate_iterator(
# self,
# prompt_token_ids: List[int],
# request_id: str,
# stop: Optional[Iterable[str]] = ...,
# adapter_name: Optional[str] = ...,
# **attrs: Any,
# ) -> AsyncGenerator[GenerationOutput, None]: ...
class _Runnable(Protocol[Mo, To]):
SUPPORTED_RESOURCES: Tuple[Literal['nvidia.com/gpu', 'amd.com/gpu', 'cpu'], ...] = ...
SUPPORTS_CPU_MULTI_THREADING: bool = ...
llm: LLM[Mo, To] = ...
config: LLMConfig = ...
model: Mo = ...
tokenizer: To = ...
def __init__(self, llm: LLM[Mo, T]) -> None: ...
def __init__(self, llm: LLM[Mo, To]) -> None: ...
async def generate_iterator(
self,
prompt_token_ids: List[int],
prompt: str,
request_id: str,
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[str, Iterable[str]]] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
@@ -45,36 +72,27 @@ Ret = TypeVar('Ret')
class RunnerMethod(Generic[In, Ret]): ...
@final
class vLLMRunnable(_Runnable[AsyncLLMEngine, PreTrainedTokenizer]): ...
@final
class CTranslateRunnable(_Runnable[Union[Translator, Generator], PreTrainedTokenizer]): ...
@final
class PyTorchRunnable(_Runnable[PreTrainedModel, PreTrainedTokenizer]):
is_encoder_decoder: bool = ...
device: torch.device = ...
def runner(llm: LLM[M, T]) -> Runner[M, T]: ...
class Runner(Protocol[Mo, To]):
__doc__: str = ...
__module__: str = ...
llm_type: str = ...
llm_tag: Tag = ...
llm_tag: bentoml.Tag = ...
identifying_params: Dict[str, Any] = ...
llm: LLM[Mo, To] = ...
config: LLMConfig = ...
backend: LiteralBackend = ...
has_adapters: bool = ...
template: str = ...
system_message: str = ...
class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]):
@staticmethod
def async_stream(
prompt_token_ids: List[int], request_id: str, stop: Optional[Union[Iterable[str], str]] = ..., adapter_name: Optional[str] = ..., **attrs: Any
prompt: str,
request_id: str,
prompt_token_ids: Optional[List[int]] = ...,
stop: Optional[Union[Iterable[str], str]] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
) -> AsyncGenerator[str, None]: ...
def __init__(
@@ -83,8 +101,8 @@ class Runner(Protocol[Mo, To]):
*,
runnable_init_params: Optional[Dict[str, Any]] = ...,
name: Optional[str] = ...,
scheduling_strategy: Type[Strategy] = ...,
models: Optional[List[Model]] = ...,
scheduling_strategy: Type[bentoml.Strategy] = ...,
models: Optional[List[bentoml.Model]] = ...,
max_batch_size: Optional[int] = ...,
max_latency_ms: Optional[int] = ...,
method_configs: Optional[Dict[str, Dict[str, int]]] = ...,
@@ -92,12 +110,12 @@ class Runner(Protocol[Mo, To]):
) -> None: ...
name: str = ...
models: List[Model] = ...
models: List[bentoml.Model] = ...
resource_config: Dict[str, Any]
runnable_class: Type[_Runnable[Mo, To]]
embedded: bool
runner_methods: List[RunnerMethod[Any, Any]]
scheduling_strategy: Type[Strategy]
scheduling_strategy: Type[bentoml.Strategy]
workers_per_resource: Union[int, float] = ...
runnable_init_params: Dict[str, Any] = ...
_runner_handle: RunnerHandle = ...

View File

@@ -1,71 +0,0 @@
from __future__ import annotations
import logging, typing as t
import bentoml, openllm, _service_vars as svars
from openllm_core._schemas import MessageParam
from bentoml.io import JSON, Text
# Module-level logger for this service module.
logger = logging.getLogger(__name__)
# Build the LLM instance from the values exposed by the `_service_vars` module.
llm = openllm.LLM[t.Any, t.Any](
model_id=svars.model_id,
model_tag=svars.model_tag,
adapter_map=svars.adapter_map, #
serialisation=svars.serialization,
trust_remote_code=svars.trust_remote_code, #
max_model_len=svars.max_model_len,
gpu_memory_utilization=svars.gpu_memory_utilization, #
)
# BentoML service named after the config's `start_name`, with the LLM's runner attached.
svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner])
# Input model class derived from the LLM config; the generate endpoints instantiate it from the request payload.
llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
# Non-streaming generation endpoint: builds the input model from the JSON payload,
# awaits the full generation and returns it as a plain dict.
@svc.api(
    route='/v1/generate',
    input=JSON.from_sample(llm_model_class.examples()),
    output=JSON.from_sample(openllm.GenerationOutput.examples()),
)
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
    generation_kwargs = llm_model_class(**input_dict).model_dump()
    result = await llm.generate(**generation_kwargs)
    return result.model_dump()
# Streaming generation endpoint: emits one `data: <json>` event per generated chunk,
# followed by a final `data: [DONE]` sentinel event.
@svc.api(
    route='/v1/generate_stream',
    input=JSON.from_sample(llm_model_class.examples()),
    output=Text(content_type='text/event-stream'),
)
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
    generation_kwargs = llm_model_class(**input_dict).model_dump()
    async for chunk in llm.generate_iterator(**generation_kwargs):
        payload = chunk.model_dump_json()
        yield f'data: {payload}\n\n'
    yield 'data: [DONE]\n\n'
# Metadata describing the served model, built once when this module is imported.
# `configuration` holds the LLM config dumped to JSON and decoded to `str`.
_Metadata = openllm.MetadataOutput(
timeout=llm.config['timeout'],
model_name=llm.config['model_name'], #
backend=llm.__llm_backend__,
model_id=llm.model_id,
configuration=llm.config.model_dump_json().decode(), #
)
# Metadata endpoint: ignores its text input and returns the precomputed `_Metadata` object.
@svc.api(
    route='/v1/metadata',
    input=Text(),
    output=JSON.from_sample(_Metadata.model_dump()),
)
def metadata_v1(_: str) -> openllm.MetadataOutput:
    return _Metadata
class MessagesConverterInput(t.TypedDict):
    """Payload accepted by the `/v1/helpers/messages` endpoint."""

    # Forwarded as `add_generation_prompt` to the tokenizer's chat template.
    add_generation_prompt: bool
    # Chat messages to render, one mapping per message.
    messages: t.List[t.Dict[str, t.Any]]
# Helper endpoint: renders a list of chat messages into a single prompt string using
# the tokenizer's chat template (without tokenizing the result).
@svc.api(
    route='/v1/helpers/messages',
    input=JSON.from_sample(
        MessagesConverterInput(
            add_generation_prompt=False,
            messages=[
                MessageParam(role='system', content='You are acting as Ernest Hemmingway.'),
                MessageParam(role='user', content='Hi there!'),
                MessageParam(role='assistant', content='Yes?'),
            ],
        )
    ),
    output=Text(),
)
def helpers_messages_v1(message: MessagesConverterInput) -> str:
    should_add_prompt = message['add_generation_prompt']
    chat_messages = message['messages']
    return llm.tokenizer.apply_chat_template(
        chat_messages, add_generation_prompt=should_add_prompt, tokenize=False
    )
openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.

View File

@@ -1,13 +0,0 @@
import os, orjson, openllm_core.utils as coreutils
# Service variables resolved from the process environment at import time.

# Required: a missing OPENLLM_MODEL_ID raises KeyError.
model_id = os.environ['OPENLLM_MODEL_ID']
model_tag = None
# JSON-encoded in the environment; falls back to JSON `null` (i.e. None) when unset.
adapter_map = orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None)))
serialization = os.getenv('OPENLLM_SERIALIZATION', default='safetensors')
trust_remote_code = coreutils.check_bool_env('TRUST_REMOTE_CODE', False)

# Both values are JSON-decoded; defaults are `null` and `0.9` respectively.
max_model_len = orjson.loads(os.getenv('MAX_MODEL_LEN', orjson.dumps(None).decode()))
gpu_memory_utilization = orjson.loads(os.getenv('GPU_MEMORY_UTILIZATION', orjson.dumps(0.9).decode()))

View File

@@ -1,9 +0,0 @@
from typing import Dict, Optional
from openllm_core._typing_compat import LiteralSerialisation
# Type stub for the service variables module. Every name the runtime module
# assigns is declared here so that attribute access on it type-checks.
model_id: str = ...
model_tag: Optional[str] = ...
adapter_map: Optional[Dict[str, str]] = ...
serialization: LiteralSerialisation = ...
trust_remote_code: bool = ...
# The runtime module also defines the two names below (JSON-decoded from the
# MAX_MODEL_LEN / GPU_MEMORY_UTILIZATION environment variables, defaulting to
# null and 0.9); they were missing from this stub.
max_model_len: Optional[int] = ...
gpu_memory_utilization: float = ...

View File

@@ -158,12 +158,16 @@ class _ResourceMixin:
elif isinstance(spec, list):
return [str(x) for x in spec]
else:
raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")
raise TypeError(
f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
)
@staticmethod
def validate(cls, val: list[t.Any]) -> None:
if cls.resource_id == 'amd.com/gpu':
raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'")
raise RuntimeError(
"AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
)
if not all(isinstance(i, str) for i in val):
raise ValueError('Input list should be all string type.')
@@ -307,12 +311,18 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
worker_index,
assigned_resource_per_worker,
)
raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].")
assigned_gpu = gpus[assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)]
raise IndexError(
f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]."
)
assigned_gpu = gpus[
assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)
]
dev = ','.join(assigned_gpu)
else:
idx = worker_index // workers_per_resource
if idx >= len(gpus):
raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}')
raise ValueError(
f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}'
)
dev = str(gpus[idx])
return dev

View File

@@ -13,7 +13,12 @@ class CascadingResourceStrategy:
TODO: Support CloudTPUResource
"""
@classmethod
def get_worker_count(cls, runnable_class: Type[bentoml.Runnable], resource_request: Optional[Dict[str, Any]], workers_per_resource: float) -> int:
def get_worker_count(
cls,
runnable_class: Type[bentoml.Runnable],
resource_request: Optional[Dict[str, Any]],
workers_per_resource: float,
) -> int:
"""Return the number of workers to be used for the given runnable class.
Note that for all available GPU, the number of workers will always be 1.
@@ -35,5 +40,7 @@ class CascadingResourceStrategy:
worker_index: The index of the worker, start from 0.
"""
@staticmethod
def transpile_workers_to_cuda_envvar(workers_per_resource: Union[float, int], gpus: List[str], worker_index: int) -> str:
def transpile_workers_to_cuda_envvar(
workers_per_resource: Union[float, int], gpus: List[str], worker_index: int
) -> str:
"""Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string."""

View File

@@ -10,7 +10,7 @@ logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
_service_file = pathlib.Path(os.path.abspath(__file__)).parent.parent / '_service.py'
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code,max_model_len,gpu_memory_utilization='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__},{__max_model_len__},{__gpu_memory_utilization__}'''
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code,max_model_len,gpu_memory_utilization,services_config='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__},{__max_model_len__},{__gpu_memory_utilization__},orjson.loads("""{__services_config__}""")'''
def build_editable(path, package='openllm'):
@@ -38,8 +38,11 @@ def build_editable(path, package='openllm'):
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
from . import RefResolver
openllm_package = 'openllm[vllm]' if llm.__llm_backend__.lower() == "vllm" else "openllm"
packages = ['scipy', 'bentoml[tracing]>=1.1.11,<1.2', f'{openllm_package}>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
packages = [
'scipy',
'bentoml[tracing]>=1.2',
f'openllm[vllm]>={RefResolver.from_strategy("release").version}',
] # apparently bnb misses this one
if adapter_map is not None:
packages += ['openllm[fine-tune]']
if extra_dependencies is not None:
@@ -57,7 +60,18 @@ def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=N
def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template, serialisation):
from openllm_cli.entrypoint import process_environ
environ = process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm, use_current_env=False)
environ = process_environ(
llm.config,
llm.config['timeout'],
1.0,
None,
True,
llm.model_id,
None,
llm._serialisation,
llm,
use_current_env=False,
)
# XXX: We need to quote this so that the envvar is recognized as valid JSON inside the container
environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'"
environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container
@@ -86,7 +100,10 @@ def create_bento(
'start_name': llm.config['start_name'],
'base_name_or_path': llm.model_id,
'bundler': 'openllm.bundle',
**{f'{package.replace("-","_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
**{
f'{package.replace("-", "_")}_version': importlib.metadata.version(package)
for package in {'openllm', 'openllm-core', 'openllm-client'}
},
})
if adapter_map:
labels.update(adapter_map)

View File

@@ -13,7 +13,10 @@ from .._llm import LLM
def build_editable(path: str, package: LiteralString) -> Optional[str]: ...
def construct_python_options(
llm: LLM[M, T], llm_fs: FS, extra_dependencies: Optional[Tuple[str, ...]] = ..., adapter_map: Optional[Dict[str, str]] = ...
llm: LLM[M, T],
llm_fs: FS,
extra_dependencies: Optional[Tuple[str, ...]] = ...,
adapter_map: Optional[Dict[str, str]] = ...,
) -> PythonOptions: ...
def construct_docker_options(
llm: LLM[M, T],

View File

@@ -1,15 +1,16 @@
import importlib
from openllm_core.utils import LazyModule
_import_structure = {'openai': [], 'hf': [], 'cohere': []}
_import_structure = {'openai': [], 'hf': []}
def mount_entrypoints(svc, llm):
for module_name in _import_structure:
module = importlib.import_module(f'.{module_name}', __name__)
svc = module.mount_to_svc(svc, llm)
svc = importlib.import_module(f'.{module_name}', __name__).mount_to_svc(svc, llm)
return svc
__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints})
__lazy = LazyModule(
__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}
)
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

View File

@@ -1,16 +1,17 @@
"""Entrypoint for all third-party apps.
Currently support OpenAI, Cohere compatible API.
Currently support OpenAI compatible API.
Each module should implement the following API:
- `mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service: ...`
"""
from bentoml import Service
from typing import Any
from _bentoml_sdk import Service
from openllm_core._typing_compat import M, T
from . import cohere as cohere, hf as hf, openai as openai
from . import hf as hf, openai as openai
from .._llm import LLM
def mount_entrypoints(svc: Service, llm: LLM[M, T]) -> Service: ...
def mount_entrypoints(svc: Service[Any], llm: LLM[M, T]) -> Service: ...

View File

@@ -1,3 +1,5 @@
from __future__ import annotations
import functools
import inspect
import types
@@ -9,6 +11,9 @@ from starlette.schemas import EndpointInfo, SchemaGenerator
from openllm_core.utils import first_not_none
if t.TYPE_CHECKING:
import pydantic
OPENAPI_VERSION, API_VERSION = '3.0.2', '1.0'
# NOTE: OpenAI schema
LIST_MODELS_SCHEMA = """\
@@ -186,9 +191,7 @@ COMPLETIONS_SCHEMA = """\
consumes:
- application/json
description: >-
Given a prompt, the model will return one or more predicted completions, and
can also return the probabilities of alternative tokens at each position. We
recommend most users use our Chat completions API.
Given a prompt, the model will return one or more predicted completions, and can also return the probabilities of alternative tokens at each position. We recommend most users use our Chat completions API.
operationId: openai__completions
produces:
- application/json
@@ -210,7 +213,7 @@ requestBody:
model: __model_id__
max_tokens: 256
temperature: 0.7
logprobs: 1
logprobs: null
top_p: 0.43
n: 1
stream: false
@@ -222,7 +225,7 @@ requestBody:
max_tokens: 256
temperature: 0.7
top_p: 0.43
logprobs: 1
logprobs: null
n: 1
stream: true
stop:
@@ -501,7 +504,11 @@ class OpenLLMSchemaGenerator(SchemaGenerator):
endpoints_info.extend(sub_endpoints)
elif not isinstance(route, Route) or not route.include_in_schema:
continue
elif inspect.isfunction(route.endpoint) or inspect.ismethod(route.endpoint) or isinstance(route.endpoint, functools.partial):
elif (
inspect.isfunction(route.endpoint)
or inspect.ismethod(route.endpoint)
or isinstance(route.endpoint, functools.partial)
):
endpoint = route.endpoint.func if isinstance(route.endpoint, functools.partial) else route.endpoint
path = self._remove_converter(route.path)
for method in route.methods or ['GET']:
@@ -546,11 +553,13 @@ def get_generator(title, components=None, tags=None, inject=True):
return OpenLLMSchemaGenerator(base_schema)
def component_schema_generator(attr_cls, description=None):
def component_schema_generator(attr_cls: pydantic.BaseModel, description=None):
schema = {'type': 'object', 'required': [], 'properties': {}, 'title': attr_cls.__name__}
schema['description'] = first_not_none(getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}')
for field in attr.fields(attr.resolve_types(attr_cls)):
attr_type = field.type
schema['description'] = first_not_none(
getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}'
)
for name, field in attr_cls.model_fields.items():
attr_type = field.annotation
origin_type = t.get_origin(attr_type)
args_type = t.get_args(attr_type)
@@ -582,15 +591,18 @@ def component_schema_generator(attr_cls, description=None):
if field.default is not attr.NOTHING and not isinstance(field.default, attr.Factory):
prop_schema['default'] = field.default
if field.default is attr.NOTHING and not isinstance(attr_type, type(t.Optional)):
schema['required'].append(field.name)
schema['properties'][field.name] = prop_schema
schema['required'].append(name)
schema['properties'][name] = prop_schema
locals().pop('prop_schema', None)
return schema
_SimpleSchema = types.new_class(
'_SimpleSchema', (object,), {}, lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it})
'_SimpleSchema',
(object,),
{},
lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it}),
)

View File

@@ -17,8 +17,13 @@ class OpenLLMSchemaGenerator:
def apply_schema(func: Callable[P, Any], **attrs: Any) -> Callable[P, Any]: ...
def add_schema_definitions(func: Callable[P, Any]) -> Callable[P, Any]: ...
def append_schemas(svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...) -> Service: ...
def append_schemas(
svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...
) -> Service: ...
def component_schema_generator(attr_cls: Type[AttrsInstance], description: Optional[str] = ...) -> Dict[str, Any]: ...
def get_generator(
title: str, components: Optional[List[Type[AttrsInstance]]] = ..., tags: Optional[List[Dict[str, Any]]] = ..., inject: bool = ...
title: str,
components: Optional[List[Type[AttrsInstance]]] = ...,
tags: Optional[List[Dict[str, Any]]] = ...,
inject: bool = ...,
) -> OpenLLMSchemaGenerator: ...

View File

@@ -1,272 +0,0 @@
from __future__ import annotations
import functools, json, logging, traceback
from http import HTTPStatus
import orjson
from starlette.applications import Starlette
from starlette.responses import JSONResponse, StreamingResponse
from starlette.routing import Route
from openllm_core.utils import DEBUG, converter, gen_random_uuid
from ._openapi import add_schema_definitions, append_schemas, get_generator
from ..protocol.cohere import (
Chat,
ChatStreamEnd,
ChatStreamStart,
ChatStreamTextGeneration,
CohereChatRequest,
CohereErrorResponse,
CohereGenerateRequest,
Generation,
Generations,
StreamingGenerations,
StreamingText,
)
# Schema generator for the Cohere-compatible endpoints. `components` lists the
# request/response protocol classes to describe; `tags` groups the routes under a
# single "Cohere" tag. `inject` follows the DEBUG flag.
schemas = get_generator(
'cohere',
components=[
CohereChatRequest,
CohereErrorResponse,
CohereGenerateRequest,
Generation,
Generations,
StreamingGenerations,
StreamingText,
Chat,
ChatStreamStart,
ChatStreamEnd,
ChatStreamTextGeneration,
],
tags=[
{
'name': 'Cohere',
'description': 'Cohere compatible API. Currently support /generate, /chat',
'externalDocs': 'https://docs.cohere.com/docs/the-cohere-platform',
}
],
inject=DEBUG,
)
# Module-level logger for the Cohere entrypoint.
logger = logging.getLogger(__name__)
def jsonify_attr(obj):
    """Return *obj* as a JSON string, after unstructuring it with the shared converter."""
    unstructured = converter.unstructure(obj)
    return json.dumps(unstructured)
def error_response(status_code, message):
    """Build a JSON error response.

    The body is an unstructured ``CohereErrorResponse`` carrying *message*; the
    HTTP status is taken from ``status_code.value``.
    """
    payload = converter.unstructure(CohereErrorResponse(text=message))
    return JSONResponse(payload, status_code=status_code.value)
async def check_model(request, model):
    """Check that the request targets the served model.

    Returns ``None`` when ``request.model`` is unset or equals *model*;
    otherwise returns a 404 error response naming the unknown model.
    """
    requested = request.model
    if requested is not None and requested != model:
        return error_response(
            HTTPStatus.NOT_FOUND,
            f"Model '{requested}' does not exists. Try 'GET /v1/models' to see current running models.",
        )
    return None
def mount_to_svc(svc, llm):
    """Mount the Cohere-compatible ASGI app on *svc* under ``/cohere``.

    Registers ``/schema`` (excluded from the generated schema), ``/v1/chat`` and
    ``/v1/generate``, each handler bound to *llm*, then appends the generated
    OpenAPI schema to the service.

    Returns:
        Whatever ``append_schemas`` returns for the updated service.
    """
    app = Starlette(
        # Follow the project-wide DEBUG flag instead of hard-coding True, so debug
        # tracebacks are not returned to clients unless debugging is enabled.
        debug=DEBUG,
        routes=[
            Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
            Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']),
            Route(
                '/v1/generate',
                endpoint=functools.partial(cohere_generate, llm=llm),
                name='cohere_generate',
                methods=['POST'],
            ),
        ],
    )
    mount_path = '/cohere'
    svc.mount_asgi_app(app, path=mount_path)
    return append_schemas(
        svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG
    )
# Handler for the Cohere-compatible generate route.
#
# Parses the JSON body into a CohereGenerateRequest, checks the requested model,
# then either streams one JSON line per output chunk (request.stream) or collects
# the whole generation and returns a single `Generations` JSON response.
# Failures are reported through `error_response` rather than raised.
#
# Documented with comments rather than a docstring because the decorator below is
# opaque from here and may consume `__doc__` -- TODO confirm.
@add_schema_definitions
async def cohere_generate(req, llm):
    json_str = await req.body()
    try:
        request = converter.structure(orjson.loads(json_str), CohereGenerateRequest)
    except orjson.JSONDecodeError as err:
        logger.debug('Sent body: %s', json_str)
        logger.error('Invalid JSON input received: %s', err)
        return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
    logger.debug('Received generate request: %s', request)

    err_check = await check_model(request, llm.llm_type)
    if err_check is not None:
        return err_check

    request_id = gen_random_uuid('cohere-generate')
    config = llm.config.compatible_options(request)

    if request.prompt_vars is not None:
        prompt = request.prompt.format(**request.prompt_vars)
    else:
        prompt = request.prompt

    # TODO: support end_sequences, stop_sequences, logit_bias, return_likelihoods, truncate
    try:
        result_generator = llm.generate_iterator(prompt, request_id=request_id, stop=request.stop_sequences, **config)
    except Exception as err:
        traceback.print_exc()
        logger.error('Error generating completion: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

    def create_stream_response_json(index, text, is_finished):
        return f'{jsonify_attr(StreamingText(index=index, text=text, is_finished=is_finished))}\n'

    async def generate_stream_generator():
        async for res in result_generator:
            for output in res.outputs:
                # NOTE(review): `finish_reason` is passed as `is_finished` unchanged -- confirm the
                # expected type of StreamingText.is_finished.
                yield create_stream_response_json(index=output.index, text=output.text, is_finished=output.finish_reason)

    try:
        # streaming case
        if request.stream:
            return StreamingResponse(generate_stream_generator(), media_type='text/event-stream')

        # Non-streaming case
        final_result = None
        # One independent accumulator per requested choice. `[[]] * n` would repeat a
        # reference to the SAME list n times, merging text/token ids across choices.
        num_choices = config['n']
        texts = [[] for _ in range(num_choices)]
        token_ids = [[] for _ in range(num_choices)]
        async for res in result_generator:
            if await req.is_disconnected():
                return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
            for output in res.outputs:
                texts[output.index].append(output.text)
                token_ids[output.index].extend(output.token_ids)
            final_result = res
        if final_result is None:
            return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')

        # Replace each output's text/token ids with the values accumulated for its index.
        final_result = final_result.with_options(
            outputs=[
                output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index])
                for output in final_result.outputs
            ]
        )
        return JSONResponse(
            converter.unstructure(
                Generations(
                    id=request_id,
                    generations=[
                        Generation(id=request_id, text=output.text, prompt=prompt, finish_reason=output.finish_reason)
                        for output in final_result.outputs
                    ],
                )
            ),
            status_code=HTTPStatus.OK.value,
        )
    except Exception as err:
        traceback.print_exc()
        logger.error('Error generating completion: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')
def _transpile_cohere_chat_messages(request: CohereChatRequest) -> list[dict[str, str]]:
def convert_role(role):
return {'User': 'user', 'Chatbot': 'assistant'}[role]
chat_history = request.chat_history
if chat_history:
messages = [{'role': convert_role(msg['role']), 'content': msg['message']} for msg in chat_history]
else:
messages = []
messages.append({'role': 'user', 'content': request.message})
return messages
@add_schema_definitions
async def cohere_chat(req, llm):
    # Handler for POST /v1/chat on the Cohere-compatible app.
    #
    # Parses the body into a CohereChatRequest, validates the model, renders the
    # conversation through the tokenizer's chat template and then either streams
    # start / text-generation / end events (one JSON object per line) or returns
    # a single Chat payload. Only the first output of each result is used.
    #
    # NOTE(review): documented with comments rather than a docstring because
    # `add_schema_definitions` presumably works with `__doc__` -- confirm before
    # turning this into a docstring.
    raw_body = await req.body()
    try:
        request = converter.structure(orjson.loads(raw_body), CohereChatRequest)
    except orjson.JSONDecodeError as err:
        logger.debug('Sent body: %s', raw_body)
        logger.error('Invalid JSON input received: %s', err)
        return error_response(HTTPStatus.BAD_REQUEST, 'Invalid JSON input received (Check server log).')
    logger.debug('Received chat completion request: %s', request)

    model_error = await check_model(request, llm.llm_type)
    if model_error is not None:
        return model_error

    request_id = gen_random_uuid('cohere-chat')
    prompt: str = llm.tokenizer.apply_chat_template(
        _transpile_cohere_chat_messages(request),
        tokenize=False,
        add_generation_prompt=llm.config['add_generation_prompt'],
    )
    logger.debug('Prompt: %r', prompt)
    config = llm.config.compatible_options(request)

    try:
        result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
    except Exception as err:
        traceback.print_exc()
        logger.error('Error generating completion: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

    def build_chat(text, prompt_tokens, response_tokens):
        # Shared by the streaming end event and the non-streaming response.
        return Chat(
            response_id=request_id,
            message=request.message,
            text=text,
            prompt=prompt,
            chat_history=request.chat_history,
            token_count={
                'prompt_tokens': prompt_tokens,
                'response_tokens': response_tokens,
                'total_tokens': prompt_tokens + response_tokens,
            },
        )

    def create_stream_generation_json(index: int, text: str, is_finished: bool) -> str:
        event = ChatStreamTextGeneration(index=index, text=text, is_finished=is_finished)
        return f'{jsonify_attr(event)}\n'

    async def completion_stream_generator():
        pieces, generated_ids = [], []
        yield f'{jsonify_attr(ChatStreamStart(is_finished=False, index=0, generation_id=request_id))}\n'
        last = None
        async for res in result_generator:
            head = res.outputs[0]
            yield create_stream_generation_json(index=head.index, text=head.text, is_finished=False)
            pieces.append(head.text)
            generated_ids.extend(head.token_ids)
            last = res
        if last is None:
            raise ValueError('No response from model.')
        end_event = ChatStreamEnd(
            is_finished=True,
            finish_reason='COMPLETE',
            index=0,
            response=build_chat(''.join(pieces), len(last.prompt_token_ids), len(generated_ids)),
        )
        yield f'{jsonify_attr(end_event)}\n'

    try:
        if request.stream:
            return StreamingResponse(completion_stream_generator(), media_type='text/event-stream')

        # Non-streaming case: drain the generator, then answer once.
        last = None
        pieces, generated_ids = [], []
        async for res in result_generator:
            if await req.is_disconnected():
                return error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected.')
            head = res.outputs[0]
            pieces.append(head.text)
            generated_ids.extend(head.token_ids)
            last = res
        if last is None:
            return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')

        full_text = ''.join(pieces)
        last = last.with_options(outputs=[last.outputs[0].with_options(text=full_text, token_ids=generated_ids)])
        chat = build_chat(''.join(pieces), len(last.prompt_token_ids), len(generated_ids))
        return JSONResponse(converter.unstructure(chat), status_code=HTTPStatus.OK.value)
    except Exception as err:
        traceback.print_exc()
        logger.error('Error generating completion: %s', err)
        return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, f'Exception: {err!s} (check server log)')

View File

@@ -1,19 +0,0 @@
from http import HTTPStatus
from typing import Optional, Union
from attr import AttrsInstance
from starlette.requests import Request
from starlette.responses import JSONResponse, Response
from bentoml import Service
from openllm_core._typing_compat import M, T
from .._llm import LLM
from ..protocol.cohere import CohereChatRequest, CohereGenerateRequest
# Type stubs for the Cohere-compatible entrypoint.

# Mount the Cohere-compatible ASGI app on the service and return the service.
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...
# Serialise an attrs instance to a JSON string.
def jsonify_attr(obj: AttrsInstance) -> str: ...
# Build a JSON error response carrying the given HTTP status and message.
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...
# Validate the model named in the request; returns an error response, or None when the request may proceed.
async def check_model(request: Union[CohereGenerateRequest, CohereChatRequest], model: str) -> Optional[JSONResponse]: ...
# Endpoint handler for the generate route.
async def cohere_generate(req: Request, llm: LLM[M, T]) -> Response: ...
# Endpoint handler for the chat route.
async def cohere_chat(req: Request, llm: LLM[M, T]) -> Response: ...

View File

@@ -27,7 +27,6 @@ def mount_to_svc(svc, llm):
debug=True,
routes=[
Route('/agent', endpoint=functools.partial(hf_agent, llm=llm), name='hf_agent', methods=['POST']),
Route('/adapters', endpoint=functools.partial(hf_adapters, llm=llm), name='adapters', methods=['GET']),
Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
],
)
@@ -37,7 +36,10 @@ def mount_to_svc(svc, llm):
def error_response(status_code, message):
return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
return JSONResponse(
converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)),
status_code=status_code.value,
)
@add_schema_definitions
@@ -53,20 +55,9 @@ async def hf_agent(req, llm):
stop = request.parameters.pop('stop', [])
try:
result = await llm.generate(request.inputs, stop=stop, **request.parameters)
return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value)
return JSONResponse(
converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value
)
except Exception as err:
logger.error('Error while generating: %s', err)
return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
@add_schema_definitions
def hf_adapters(req, llm):
    # Handler for GET /adapters: list the adapters attached to `llm`.
    #
    # Responds 404 through error_response when the model has no adapters.
    # Otherwise returns a JSON object keyed by the second element of each adapter
    # tuple, holding the adapter name and its PEFT type.
    #
    # NOTE(review): documented with comments rather than a docstring because
    # `add_schema_definitions` presumably works with `__doc__` -- confirm.
    if not llm.has_adapters:
        return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')

    # Merge every group in adapter_map. The previous `dict(*values)` raised
    # TypeError as soon as adapter_map held more than one group, because dict()
    # accepts at most one positional argument.
    adapters = {}
    for group in llm.adapter_map.values():
        adapters.update(group)

    return JSONResponse(
        {
            adapter_tuple[1]: {'adapter_name': k, 'adapter_type': adapter_tuple[0].peft_type.value}
            for k, adapter_tuple in adapters.items()
        },
        status_code=HTTPStatus.OK.value,
    )

View File

@@ -12,7 +12,7 @@ from starlette.routing import Route
from openllm_core.utils import converter, gen_random_uuid
from ._openapi import add_schema_definitions, append_schemas, apply_schema, get_generator
from ..protocol.openai import (
from openllm_core.protocol.openai import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseChoice,
@@ -61,7 +61,11 @@ def jsonify_attr(obj):
def error_response(status_code, message):
return JSONResponse(
{'error': converter.unstructure(ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value)))},
{
'error': converter.unstructure(
ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value))
)
},
status_code=status_code.value,
)
@@ -95,7 +99,11 @@ def create_logprobs(token_ids, top_logprobs, num_output_top_logprobs=None, initi
logprobs.text_offset.append(logprobs.text_offset[-1] + last_token_len)
last_token_len = len(token)
if num_output_top_logprobs:
logprobs.top_logprobs.append({llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()} if step_top_logprobs else None)
logprobs.top_logprobs.append(
{llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()}
if step_top_logprobs
else None
)
return logprobs
@@ -106,8 +114,14 @@ def mount_to_svc(svc, llm):
app = Starlette(
debug=True,
routes=[
Route('/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']),
Route('/completions', functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm), methods=['POST']),
Route(
'/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']
),
Route(
'/completions',
functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm),
methods=['POST'],
),
Route(
'/chat/completions',
functools.partial(
@@ -132,7 +146,9 @@ def mount_to_svc(svc, llm):
# GET /v1/models
@add_schema_definitions
def list_models(_, llm):
return JSONResponse(converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value)
return JSONResponse(
converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value
)
# POST /v1/chat/completions
@@ -166,7 +182,9 @@ async def chat_completions(req, llm):
config = llm.config.compatible_options(request)
def get_role() -> str:
return request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant' # TODO: Support custom role here.
return (
request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant'
) # TODO: Support custom role here.
try:
result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
@@ -180,7 +198,9 @@ async def chat_completions(req, llm):
id=request_id,
created=created_time,
model=model_name,
choices=[ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)],
choices=[
ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)
],
)
if usage is not None:
response.usage = usage
@@ -230,13 +250,20 @@ async def chat_completions(req, llm):
final_result = res
if final_result is None:
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
final_result = final_result.with_options(
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs]
final_result = final_result.model_copy(
update=dict(
outputs=[
output.model_copy(update=dict(text=''.join(texts[output.index]), token_ids=token_ids[output.index]))
for output in final_result.outputs
]
)
)
role = get_role()
choices = [
ChatCompletionResponseChoice(index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason)
ChatCompletionResponseChoice(
index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason
)
for output in final_result.outputs
]
if request.echo:
@@ -250,7 +277,9 @@ async def chat_completions(req, llm):
num_prompt_tokens = len(final_result.prompt_token_ids)
num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens)
response = ChatCompletionResponse(id=request_id, created=created_time, model=model_name, usage=usage, choices=choices)
response = ChatCompletionResponse(
id=request_id, created=created_time, model=model_name, usage=usage, choices=choices
)
return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
except Exception as err:
traceback.print_exc()
@@ -342,7 +371,13 @@ async def completions(req, llm):
top_logprobs = res.prompt_logprobs
previous_echo[i] = True
if request.logprobs is not None:
logprobs = create_logprobs(output.token_ids, output.logprobs[previous_num_tokens[i] :], request.logprobs, len(previous_texts[i]), llm=llm)
logprobs = create_logprobs(
output.token_ids,
output.logprobs[previous_num_tokens[i] :],
request.logprobs,
len(previous_texts[i]),
llm=llm,
)
previous_num_tokens[i] += len(output.token_ids)
previous_texts[i] += output.text
yield f'data: {create_stream_response_json(index=i, text=output.text, logprobs=logprobs, finish_reason=output.finish_reason)}\n\n'
@@ -368,8 +403,13 @@ async def completions(req, llm):
final_result = res
if final_result is None:
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
final_result = final_result.with_options(
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs]
final_result = final_result.model_copy(
update=dict(
outputs=[
output.model_copy(update=dict(text=''.join(texts[output.index]), token_ids=token_ids[output.index]))
for output in final_result.outputs
]
)
)
choices = []
@@ -392,7 +432,9 @@ async def completions(req, llm):
output_text = prompt_text + output_text
else:
output_text = prompt_text
choice_data = CompletionResponseChoice(index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason)
choice_data = CompletionResponseChoice(
index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason
)
choices.append(choice_data)
num_prompt_tokens = len(final_result.prompt_token_ids)

View File

@@ -14,7 +14,9 @@ from ..protocol.openai import ChatCompletionRequest, CompletionRequest, LogProbs
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...
def jsonify_attr(obj: AttrsInstance) -> str: ...
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...
async def check_model(request: Union[CompletionRequest, ChatCompletionRequest], model: str) -> Optional[JSONResponse]: ...
async def check_model(
request: Union[CompletionRequest, ChatCompletionRequest], model: str
) -> Optional[JSONResponse]: ...
def create_logprobs(
token_ids: List[int],
top_logprobs: List[Dict[int, float]], #

View File

@@ -7,4 +7,5 @@ from openllm_core.exceptions import (
ValidationError as ValidationError, #
MissingAnnotationAttributeError as MissingAnnotationAttributeError,
MissingDependencyError as MissingDependencyError,
ModelNotFound as ModelNotFound, #
)

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
import openllm, transformers, typing as t
import openllm, typing as t
def load_model(llm: openllm.LLM, config: transformers.PretrainedConfig, **attrs: t.Any): ...
def load_model(llm: openllm.LLM, *args: t.Any, **attrs: t.Any): ...

View File

@@ -1,11 +0,0 @@
from __future__ import annotations
import os
import typing as t
from openllm_core.utils import LazyModule
# Submodules exposed lazily by this package; none of them re-exports extra names here.
_import_structure: dict[str, list[str]] = {'openai': [], 'cohere': [], 'hf': []}
if t.TYPE_CHECKING:
    from . import cohere as cohere, hf as hf, openai as openai
# Pass the real module path: `os.path.abspath('__file__')` resolved the literal
# string '__file__' against the current working directory instead of this file.
__lazy = LazyModule(__name__, os.path.abspath(__file__), _import_structure)
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

View File

@@ -1,154 +0,0 @@
from __future__ import annotations
import typing as t
from enum import Enum
import attr
from openllm_core.utils import converter
@attr.define
class CohereErrorResponse:
    """Error payload for the Cohere-compatible protocol; carries only a message."""

    text: str


# Unstructure the error as its bare text rather than as a {'text': ...} mapping.
converter.register_unstructure_hook(CohereErrorResponse, lambda obj: obj.text)
@attr.define
class CohereGenerateRequest:
    """Request body of the Cohere-compatible generate endpoint.

    Only ``prompt`` is required; every other field is optional.
    """

    prompt: str
    # When set, the generate handler fills `prompt` with str.format(**prompt_vars).
    prompt_vars: t.Optional[t.Dict[str, t.Any]] = None
    model: t.Optional[str] = None
    preset: t.Optional[str] = None
    num_generations: t.Optional[int] = None
    max_tokens: t.Optional[int] = None
    temperature: t.Optional[float] = None
    k: t.Optional[int] = None
    p: t.Optional[float] = None
    frequency_penalty: t.Optional[float] = None
    presence_penalty: t.Optional[float] = None
    end_sequences: t.Optional[t.List[str]] = None
    # Forwarded by the generate handler as the `stop` argument of generation.
    stop_sequences: t.Optional[t.List[str]] = None
    return_likelihoods: t.Optional[t.Literal['GENERATION', 'ALL', 'NONE']] = None
    truncate: t.Optional[str] = None
    logit_bias: t.Optional[t.Dict[int, float]] = None
    # Selects the streaming response path in the generate handler.
    stream: bool = False
@attr.define
class TokenLikelihood:  # pretty sure this is similar to token_logprobs
    """A token paired with its likelihood value."""

    token: str
    likelihood: float
@attr.define
class Generation:
    """A single generated completion returned by the generate endpoint."""

    id: str
    text: str
    # The prompt the text was generated from.
    prompt: str
    likelihood: t.Optional[float] = None
    token_likelihoods: t.List[TokenLikelihood] = attr.field(factory=list)
    finish_reason: t.Optional[str] = None
@attr.define
class Generations:
    """Non-streaming generate response: a request id plus its generations."""

    id: str
    generations: t.List[Generation]
    meta: t.Optional[t.Dict[str, t.Any]] = None
@attr.define
class StreamingText:
    """One streamed chunk of generated text for the output at ``index``."""

    index: int
    text: str
    is_finished: bool
@attr.define
class StreamingGenerations:
    """Container pairing a ``Generations`` payload with a list of texts.

    NOTE(review): not referenced by the handlers visible in this module's
    entrypoint -- confirm whether it is still used.
    """

    id: str
    generations: Generations
    texts: t.List[str]
    meta: t.Optional[t.Dict[str, t.Any]] = None
@attr.define
class CohereChatRequest:
    """Request body of the Cohere-compatible chat endpoint.

    Only ``message`` is required. ``chat_history`` entries are mappings with
    ``role`` and ``message`` keys.
    """

    message: str
    conversation_id: t.Optional[str] = ''
    model: t.Optional[str] = None
    return_chat_history: t.Optional[bool] = False
    return_prompt: t.Optional[bool] = False
    return_preamble: t.Optional[bool] = False
    chat_history: t.Optional[t.List[t.Dict[str, str]]] = None
    preamble_override: t.Optional[str] = None
    user_name: t.Optional[str] = None
    temperature: t.Optional[float] = 0.8
    max_tokens: t.Optional[int] = None
    # Selects the streaming response path in the chat handler.
    stream: t.Optional[bool] = False
    p: t.Optional[float] = None
    # Top-k is a token count: typed as int to match CohereGenerateRequest.k, so a
    # structured request does not carry it as a float (e.g. 40 -> 40.0).
    k: t.Optional[int] = None
    logit_bias: t.Optional[t.Dict[int, float]] = None
    search_queries_only: t.Optional[bool] = None
    documents: t.Optional[t.List[t.Dict[str, t.Any]]] = None
    citation_quality: t.Optional[str] = None
    prompt_truncation: t.Optional[str] = None
    connectors: t.Optional[t.List[t.Dict[str, t.Any]]] = None
class StreamEvent(str, Enum):
    """Event type tags for chat streaming; members are also plain strings."""

    STREAM_START = 'stream-start'
    TEXT_GENERATION = 'text-generation'
    STREAM_END = 'stream-end'
    # TODO: The following are yet to be implemented
    SEARCH_QUERIES_GENERATION = 'search-queries-generation'
    SEARCH_RESULTS = 'search-results'
    CITATION_GENERATION = 'citation-generation'
@attr.define
class Chat:
    """Chat response payload.

    ``response_id``, ``message`` and ``text`` are required; everything else
    defaults to ``None``.
    """

    response_id: str
    # The user message this response answers.
    message: str
    # The generated reply text.
    text: str
    generation_id: t.Optional[str] = None
    conversation_id: t.Optional[str] = None
    meta: t.Optional[t.Dict[str, t.Any]] = None
    prompt: t.Optional[str] = None
    chat_history: t.Optional[t.List[t.Dict[str, t.Any]]] = None
    preamble: t.Optional[str] = None
    # The chat handler fills this with 'prompt_tokens', 'response_tokens' and 'total_tokens'.
    token_count: t.Optional[t.Dict[str, int]] = None
    is_search_required: t.Optional[bool] = None
    citations: t.Optional[t.List[t.Dict[str, t.Any]]] = None
    documents: t.Optional[t.List[t.Dict[str, t.Any]]] = None
    search_results: t.Optional[t.List[t.Dict[str, t.Any]]] = None
    search_queries: t.Optional[t.List[t.Dict[str, t.Any]]] = None
@attr.define
class ChatStreamResponse:
    """Base class for chat stream events; subclasses fix ``event_type``."""

    is_finished: bool
    event_type: StreamEvent
    index: int
@attr.define
class ChatStreamStart(ChatStreamResponse):
    """First event of a chat stream, announcing the generation id."""

    generation_id: str
    conversation_id: t.Optional[str] = None
    # Redeclared with a default so callers do not have to pass it.
    event_type: StreamEvent = StreamEvent.STREAM_START
@attr.define
class ChatStreamTextGeneration(ChatStreamResponse):
    """Chat stream event carrying one chunk of generated text."""

    text: str
    # Redeclared with a default so callers do not have to pass it.
    event_type: StreamEvent = StreamEvent.TEXT_GENERATION
@attr.define
class ChatStreamEnd(ChatStreamResponse):
    """Final event of a chat stream, carrying the complete ``Chat`` response."""

    finish_reason: str
    response: Chat
    # Redeclared with a default so callers do not have to pass it.
    event_type: StreamEvent = StreamEvent.STREAM_END

View File

@@ -1,21 +0,0 @@
from __future__ import annotations
import typing as t
import attr
@attr.define
class AgentRequest:
    """Request with an input string and a free-form parameter mapping.

    NOTE(review): presumably the body of the HF agent endpoint, whose handler
    reads ``inputs`` and ``parameters`` from its request -- confirm.
    """

    inputs: str
    parameters: t.Dict[str, t.Any]
@attr.define
class AgentResponse:
    """Response item holding the generated text."""

    generated_text: str
@attr.define
class HFErrorResponse:
    """Error payload: a numeric error code plus a message."""

    error_code: int
    message: str

View File

@@ -1,188 +0,0 @@
from __future__ import annotations
import time
import typing as t
import attr
import openllm_core
from openllm_core._schemas import FinishReason
from openllm_core.utils import converter
@attr.define
class ErrorResponse:
    """Error payload of the OpenAI-compatible protocol."""

    message: str
    type: str
    object: str = 'error'
    param: t.Optional[str] = None
    code: t.Optional[str] = None
def _stop_converter(data: t.Union[str, t.List[str]]) -> t.List[str]:
if not data:
return None
return [data] if isinstance(data, str) else data
@attr.define
class CompletionRequest:
    """Text completion request of the OpenAI-compatible protocol.

    Only ``prompt`` is required.
    """

    prompt: str
    model: str = attr.field(default=None)
    suffix: t.Optional[str] = attr.field(default=None)
    max_tokens: t.Optional[int] = attr.field(default=16)
    temperature: t.Optional[float] = attr.field(default=1.0)
    top_p: t.Optional[float] = attr.field(default=1.0)
    n: t.Optional[int] = attr.field(default=1)
    stream: t.Optional[bool] = attr.field(default=False)
    logprobs: t.Optional[int] = attr.field(default=None)
    echo: t.Optional[bool] = attr.field(default=False)
    # Normalised by _stop_converter: a bare string becomes a one-element list, falsy becomes None.
    stop: t.Optional[t.Union[str, t.List[str]]] = attr.field(default=None, converter=_stop_converter)
    presence_penalty: t.Optional[float] = attr.field(default=0.0)
    frequency_penalty: t.Optional[float] = attr.field(default=0.0)
    logit_bias: t.Optional[t.Dict[str, float]] = attr.field(default=None)
    user: t.Optional[str] = attr.field(default=None)
    # supported by vLLM and us
    top_k: t.Optional[int] = attr.field(default=None)
    best_of: t.Optional[int] = attr.field(default=1)
@attr.define
class ChatCompletionRequest:
    """Chat completion request of the OpenAI-compatible protocol.

    Only ``messages`` is required.
    """

    messages: t.List[t.Dict[str, str]]
    model: str = attr.field(default=None)
    functions: t.List[t.Dict[str, str]] = attr.field(default=attr.Factory(list))
    function_calls: t.List[t.Dict[str, str]] = attr.field(default=attr.Factory(list))
    temperature: t.Optional[float] = attr.field(default=None)
    top_p: t.Optional[float] = attr.field(default=None)
    n: t.Optional[int] = attr.field(default=None)
    stream: t.Optional[bool] = attr.field(default=False)
    # Normalised by _stop_converter: a bare string becomes a one-element list, falsy becomes None.
    stop: t.Optional[t.Union[str, t.List[str]]] = attr.field(default=None, converter=_stop_converter)
    max_tokens: t.Optional[int] = attr.field(default=None)
    presence_penalty: t.Optional[float] = attr.field(default=None)
    frequency_penalty: t.Optional[float] = attr.field(default=None)
    echo: t.Optional[bool] = attr.field(default=False)
    logit_bias: t.Optional[t.Dict[str, float]] = attr.field(default=None)
    user: t.Optional[str] = attr.field(default=None)
    # supported by vLLM and us
    top_k: t.Optional[int] = attr.field(default=None)
    best_of: t.Optional[int] = attr.field(default=1)
    # Additional features to support chat_template
    chat_template: str = attr.field(default=None)
    add_generation_prompt: bool = attr.field(default=True)
@attr.define
class LogProbs:
    """Parallel lists describing per-token log probabilities; all start empty."""

    text_offset: t.List[int] = attr.field(default=attr.Factory(list))
    token_logprobs: t.List[float] = attr.field(default=attr.Factory(list))
    tokens: t.List[str] = attr.field(default=attr.Factory(list))
    top_logprobs: t.List[t.Dict[str, t.Any]] = attr.field(default=attr.Factory(list))
@attr.define
class UsageInfo:
    """Token counts for a request; every count defaults to 0."""

    prompt_tokens: int = attr.field(default=0)
    completion_tokens: int = attr.field(default=0)
    total_tokens: int = attr.field(default=0)
@attr.define
class CompletionResponseChoice:
    """One choice of a non-streaming text completion response."""

    index: int
    text: str
    logprobs: t.Optional[LogProbs] = None
    finish_reason: t.Optional[FinishReason] = None
@attr.define
class CompletionResponseStreamChoice:
    """One choice of a streamed text completion chunk."""

    index: int
    text: str
    logprobs: t.Optional[LogProbs] = None
    finish_reason: t.Optional[FinishReason] = None
@attr.define
class CompletionStreamResponse:
    """One streamed chunk of a text completion response.

    ``id`` and ``created`` are generated per instance when not supplied.
    """

    model: str
    choices: t.List[CompletionResponseStreamChoice]
    object: str = 'text_completion'
    id: str = attr.field(default=attr.Factory(lambda: openllm_core.utils.gen_random_uuid('cmpl')))
    # Unix epoch seconds. time.monotonic() has an undefined reference point and
    # therefore does not yield a usable creation timestamp.
    created: int = attr.field(default=attr.Factory(lambda: int(time.time())))
    usage: t.Optional[UsageInfo] = attr.field(default=None)
@attr.define
class CompletionResponse:
    """Non-streaming text completion response.

    ``id`` and ``created`` are generated per instance when not supplied.
    """

    choices: t.List[CompletionResponseChoice]
    model: str
    usage: UsageInfo
    object: str = 'text_completion'
    id: str = attr.field(default=attr.Factory(lambda: openllm_core.utils.gen_random_uuid('cmpl')))
    # Unix epoch seconds. time.monotonic() has an undefined reference point and
    # therefore does not yield a usable creation timestamp.
    created: int = attr.field(default=attr.Factory(lambda: int(time.time())))
# Roles accepted for chat messages and deltas.
LiteralRole = t.Literal['system', 'user', 'assistant']


@attr.define
class Delta:
    """Incremental chat message content for a streamed chunk; both fields are optional."""

    role: t.Optional[LiteralRole] = None
    content: t.Optional[str] = None
@attr.define
class ChatMessage:
    """A complete chat message: a role and its content."""

    role: LiteralRole
    content: str


# Unstructure a message as a plain {'role', 'content'} mapping.
converter.register_unstructure_hook(ChatMessage, lambda msg: {'role': msg.role, 'content': msg.content})
@attr.define
class ChatCompletionResponseStreamChoice:
    """One choice of a streamed chat completion chunk, carrying a ``Delta``."""

    index: int
    delta: Delta
    finish_reason: t.Optional[FinishReason] = None
@attr.define
class ChatCompletionResponseChoice:
    """One choice of a non-streaming chat completion, carrying a full ``ChatMessage``."""

    index: int
    message: ChatMessage
    finish_reason: t.Optional[FinishReason] = None
@attr.define
class ChatCompletionResponse:
    """Non-streaming chat completion response.

    ``id``, ``created`` and ``usage`` are generated per instance when not supplied.
    """

    choices: t.List[ChatCompletionResponseChoice]
    model: str
    object: str = 'chat.completion'
    id: str = attr.field(default=attr.Factory(lambda: openllm_core.utils.gen_random_uuid('chatcmpl')))
    # Unix epoch seconds. time.monotonic() has an undefined reference point and
    # therefore does not yield a usable creation timestamp.
    created: int = attr.field(default=attr.Factory(lambda: int(time.time())))
    usage: UsageInfo = attr.field(default=attr.Factory(lambda: UsageInfo()))
@attr.define
class ChatCompletionStreamResponse:
    """One streamed chunk of a chat completion response.

    ``id`` and ``created`` are generated per instance when not supplied.
    """

    choices: t.List[ChatCompletionResponseStreamChoice]
    model: str
    object: str = 'chat.completion.chunk'
    id: str = attr.field(default=attr.Factory(lambda: openllm_core.utils.gen_random_uuid('chatcmpl')))
    # Unix epoch seconds. time.monotonic() has an undefined reference point and
    # therefore does not yield a usable creation timestamp.
    created: int = attr.field(default=attr.Factory(lambda: int(time.time())))
    usage: t.Optional[UsageInfo] = attr.field(default=None)
@attr.define
class ModelCard:
    """Description of one served model, as listed by the models endpoint."""

    id: str
    object: str = 'model'
    # Unix epoch seconds. time.monotonic() has an undefined reference point and
    # therefore does not yield a usable creation timestamp.
    created: int = attr.field(default=attr.Factory(lambda: int(time.time())))
    owned_by: str = 'na'
@attr.define
class ModelList:
    """List wrapper around ``ModelCard`` entries; ``data`` starts empty."""

    object: str = 'list'
    data: t.List[ModelCard] = attr.field(factory=list)

View File

@@ -1,16 +1,43 @@
from __future__ import annotations
import importlib, typing as t
from openllm_core._typing_compat import M, ParamSpec, T, TypeGuard, Concatenate
import importlib, logging, inspect, typing as t
from openllm_core._typing_compat import ParamSpec, Concatenate, TypeGuard
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import (
apply,
first_not_none,
generate_hash_from_file,
resolve_filepath,
validate_is_path,
normalise_model_name,
)
if t.TYPE_CHECKING:
from bentoml import Model
from .._llm import LLM
P = ParamSpec('P')
M = t.TypeVar('M')
T = t.TypeVar('T')
logger = logging.getLogger(__name__)
def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
@apply(lambda val: tuple(str.lower(i) if i else i for i in val))
def _make_tag_components(model_id: str, model_version: t.Optional[str]) -> t.Tuple[str, t.Optional[str]]:
    """Derive the ``(name, version)`` pair for a model id.

    A revision embedded in ``model_id`` after a ``:`` takes precedence over
    ``model_version`` (a warning is logged when both are given). When the id is
    a filesystem path it is resolved, and a hash of the path contents is used as
    the version if none was determined. The decorator lower-cases every
    non-empty component of the returned tuple.
    """
    model_id, *revisions = model_id.rsplit(':')
    if revisions:
        revision = revisions[0]
        if model_version is not None:
            logger.warning(
                "revision is specified (%s). 'model_version=%s' will be ignored.", revision, model_version
            )
        model_version = revision
    if validate_is_path(model_id):
        model_id = resolve_filepath(model_id)
        # The hash is computed eagerly, even when a version is already known.
        model_version = first_not_none(model_version, default=generate_hash_from_file(model_id))
    return normalise_model_name(model_id), model_version
def load_tokenizer(llm, **tokenizer_attrs):
import cloudpickle, fs, transformers
from bentoml._internal.models.model import CUSTOM_OBJECTS_FILENAME
from .transformers._helpers import process_config
@@ -30,7 +57,9 @@ def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
'For example: "bentoml.transformers.save_model(..., custom_objects={\'tokenizer\': tokenizer})"'
) from None
else:
tokenizer = transformers.AutoTokenizer.from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs)
tokenizer = transformers.AutoTokenizer.from_pretrained(
bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs
)
if tokenizer.pad_token_id is None:
if config.pad_token_id is not None:
@@ -42,32 +71,34 @@ def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
return tokenizer
def _make_dispatch_function(fn: str) -> t.Callable[Concatenate[LLM[M, T], P], TypeGuard[M | T | Model]]:
def caller(llm: LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> TypeGuard[M | T | Model]:
def _make_dispatch_function(fn: str) -> t.Callable[Concatenate[LLM[M, T], P], TypeGuard[t.Union[M, T, Model]]]:
def caller(llm: t.Optional[LLM[M, T]] = None, *args: P.args, **kwargs: P.kwargs) -> TypeGuard[t.Union[M, T, Model]]:
"""Generic function dispatch to correct serialisation submodules based on LLM runtime.
> [!NOTE] See 'openllm.serialisation.transformers' if 'llm.__llm_backend__ in ("pt", "vllm")'
> [!NOTE] See 'openllm.serialisation.ggml' if 'llm.__llm_backend__="ggml"'
> [!NOTE] See 'openllm.serialisation.ctranslate' if 'llm.__llm_backend__="ctranslate"'
"""
if llm.__llm_backend__ == 'ggml':
serde = 'ggml'
elif llm.__llm_backend__ == 'ctranslate':
serde = 'ctranslate'
elif llm.__llm_backend__ in {'pt', 'vllm'}:
serde = 'transformers'
else:
raise OpenLLMException(f'Not supported backend {llm.__llm_backend__}')
return getattr(importlib.import_module(f'.{serde}', 'openllm.serialisation'), fn)(llm, *args, **kwargs)
backend = kwargs.get('_backend', None)
if backend is None:
if llm is None:
raise OpenLLMException('Cannot dispatch without LLM instance.')
backend = llm.__llm_backend__
serde_mapping = {'pt': 'transformers', 'vllm': 'vllm', 'ggml': 'ggml'}
try:
serde = serde_mapping[backend]
except KeyError:
raise OpenLLMException(f'Not supported backend {backend}')
call = getattr(importlib.import_module(f'.{serde}', 'openllm.serialisation'), fn)
params = inspect.signature(call).parameters
return call(llm, *args, **kwargs) if next(iter(params.keys())) == 'llm' else call(*args, **kwargs)
return caller
_extras = ['get', 'import_model', 'load_model']
_import_structure = {'ggml', 'transformers', 'ctranslate', 'constants'}
__all__ = ['load_tokenizer', *_extras, *_import_structure]
_extras = ['import_model', 'load_model']
_import_structure = {'ggml', 'transformers', 'vllm', 'constants'}
__all__ = ['_make_tag_components', 'load_tokenizer', *_extras, *_import_structure]
def __dir__() -> t.Sequence[str]:
@@ -75,9 +106,7 @@ def __dir__() -> t.Sequence[str]:
def __getattr__(name: str) -> t.Any:
if name == 'load_tokenizer':
return load_tokenizer
elif name in _import_structure:
if name in _import_structure:
return importlib.import_module(f'.{name}', __name__)
elif name in _extras:
return _make_dispatch_function(name)

View File

@@ -6,18 +6,10 @@ Currently, GGML format is working in progress.
"""
from typing import Any
from bentoml import Model
from openllm import LLM
from openllm_core._typing_compat import M, T
from . import constants as constants, ggml as ggml, transformers as transformers
from bentoml import Model as _Model
from openllm import LLM as _LLM
from . import constants as constants, ggml as ggml, transformers as transformers, vllm as vllm
def load_tokenizer(llm: LLM[M, T], **attrs: Any) -> T:
"""Load the tokenizer from BentoML store.
By default, it will try to find the bentomodel whether it is in store..
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
"""
def get(llm: LLM[M, T]) -> Model: ...
def import_model(llm: LLM[M, T], *args: Any, trust_remote_code: bool, **attrs: Any) -> Model: ...
def load_model(llm: LLM[M, T], *args: Any, **attrs: Any) -> M: ...
def import_model(*args: Any, trust_remote_code: bool, **attrs: Any) -> _Model: ...
def load_model(llm: _LLM, *args: Any, **attrs: Any) -> Any: ...
def load_tokenizer(llm: _LLM, **attrs: Any) -> Any: ...

View File

@@ -1,18 +1,21 @@
import contextlib, attr
from __future__ import annotations
import contextlib, attr, bentoml, openllm, types, logging, typing as t
from simple_di import Provide, inject
import bentoml, openllm
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelOptions, ModelSignature
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import is_autogptq_available
from openllm_core._typing_compat import LiteralSerialisation, LiteralQuantise, LiteralBackend
if t.TYPE_CHECKING:
import transformers
from bentoml._internal.models import ModelStore
_object_setattr = object.__setattr__
logger = logging.getLogger(__name__)
def get_hash(config) -> str:
def get_hash(config: transformers.PretrainedConfig) -> str:
_commit_hash = getattr(config, '_commit_hash', None)
if _commit_hash is None:
raise ValueError(f'Cannot find commit hash in {config}')
logger.warning('Cannot find commit hash in %r', config)
return _commit_hash
@@ -29,7 +32,9 @@ def patch_correct_tag(llm, config, _revision=None) -> None:
if _revision is None and llm.tag.version is not None:
_revision = llm.tag.version
if llm.tag.version is None:
_object_setattr(llm, '_tag', attr.evolve(llm.tag, version=_revision)) # HACK: This copies the correct revision into llm.tag
_object_setattr(
llm, '_tag', attr.evolve(llm.tag, version=_revision)
) # HACK: This copies the correct revision into llm.tag
if llm._revision is None:
_object_setattr(llm, '_revision', _revision) # HACK: This copies the correct revision into llm._model_version
@@ -37,7 +42,7 @@ def patch_correct_tag(llm, config, _revision=None) -> None:
def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadata=None):
if metadata is None:
metadata = {}
metadata.update({'safe_serialisation': safe_serialisation, '_framework': llm.__llm_backend__})
metadata.update({'_framework': llm.__llm_backend__})
if llm.quantise:
metadata['_quantize'] = llm.quantise
architectures = getattr(config, 'architectures', [])
@@ -45,7 +50,9 @@ def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadat
if trust_remote_code:
auto_map = getattr(config, 'auto_map', {})
if not auto_map:
raise RuntimeError(f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}')
raise RuntimeError(
f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}'
)
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if autoclass not in auto_map:
raise RuntimeError(
@@ -56,89 +63,97 @@ def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadat
raise RuntimeError(
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
)
metadata.update({'_pretrained_class': architectures[0], '_revision': get_hash(config) if not llm.local else llm.revision})
metadata.update({
'_pretrained_class': architectures[0],
'_revision': get_hash(config) if not llm.local else llm.revision,
'_local': llm.local,
'serialisation': llm._serialisation,
'model_name': llm.config['model_name'],
'architecture': llm.config['architecture'],
'model_id': llm.model_id,
})
return metadata
def _create_signatures(llm, signatures=None):
if signatures is None:
signatures = {}
if llm.__llm_backend__ == 'pt':
if llm.quantise == 'gptq':
if not is_autogptq_available():
raise OpenLLMException("Requires 'auto-gptq' and 'optimum'. Install it with 'pip install \"openllm[gptq]\"'")
signatures['generate'] = {'batchable': False}
else:
signatures.update({
k: ModelSignature(batchable=False)
for k in (
'__call__',
'forward',
'generate', #
'contrastive_search',
'greedy_search', #
'sample',
'beam_search',
'beam_sample', #
'group_beam_search',
'constrained_beam_search', #
)
})
elif llm.__llm_backend__ == 'ctranslate':
if llm.config['model_type'] == 'seq2seq_lm':
non_batch_keys = {'score_file', 'translate_file'}
batch_keys = {'generate_tokens', 'score_batch', 'translate_batch', 'translate_iterable', 'score_iterable'}
else:
non_batch_keys = set()
batch_keys = {
'async_generate_tokens',
'forward_batch',
'generate_batch', #
'generate_iterable',
'generate_tokens',
'score_batch',
'score_iterable', #
}
signatures.update({k: ModelSignature(batchable=False) for k in non_batch_keys})
signatures.update({k: ModelSignature(batchable=True) for k in batch_keys})
return signatures
# bentoml.Model subclass that additionally remembers the list of modules
# imported while the model is being saved.
@attr.define(init=False)
class _Model(bentoml.Model):
# Backing storage for `imported_modules`; stays None until first read or assignment.
_imported_modules: t.List[types.ModuleType] = None
@property
def imported_modules(self):
# Lazily replace the None default with a fresh list so callers always get a list.
if self._imported_modules is None:
self._imported_modules = []
return self._imported_modules
@imported_modules.setter
def imported_modules(self, value):
self._imported_modules = value
# Alternate constructor: forwards to the parent `create`, always passing empty
# signatures and a context built by `openllm.utils.generate_context('openllm')`.
@classmethod
def create(cls, tag, *, module, api_version, labels=None, metadata=None):
return super().create(
tag,
module=module,
api_version=api_version,
signatures={},
labels=labels,
metadata=metadata,
context=openllm.utils.generate_context('openllm'),
)
@inject
@contextlib.contextmanager
def save_model(
llm,
config,
safe_serialisation, #
trust_remote_code,
module,
external_modules, #
_model_store=Provide[BentoMLContainer.model_store],
_api_version='v2.1.0', #
):
tag: bentoml.Tag,
config: transformers.PretrainedConfig,
serialisation: LiteralSerialisation, #
trust_remote_code: bool,
module: str,
external_modules: list[types.ModuleType], #
model_id: str,
quantise: LiteralQuantise,
backend: LiteralBackend,
_local: bool,
_dtype: str,
_model_store: ModelStore = Provide[BentoMLContainer.model_store],
_api_version: str = 'v3.0.0', #
) -> bentoml.Model:
imported_modules = []
bentomodel = bentoml.Model.create(
llm.tag,
module=f'openllm.serialisation.{module}', #
architectures = getattr(config, 'architectures', [])
_metadata = {
'model_id': model_id,
'backend': backend,
'dtype': _dtype,
'architectures': architectures,
'_revision': get_hash(config) or tag.version,
'_local': _local,
'serialisation': serialisation,
}
if quantise:
_metadata['_quantize'] = quantise
bentomodel = _Model.create(
tag,
module=f'openllm.serialisation.{module}',
api_version=_api_version,
options=ModelOptions(), #
context=openllm.utils.generate_context('openllm'),
labels=openllm.utils.generate_labels(llm),
metadata=_create_metadata(llm, config, safe_serialisation, trust_remote_code),
signatures=_create_signatures(llm),
labels=openllm.utils.generate_labels(serialisation),
metadata=_metadata,
)
with openllm.utils.analytics.set_bentoml_tracking():
try:
bentomodel.enter_cloudpickle_context(external_modules, imported_modules)
yield bentomodel, imported_modules
bentomodel.imported_modules = imported_modules
yield bentomodel
except Exception:
raise
else:
bentomodel.flush()
bentomodel.save(_model_store)
openllm.utils.analytics.track(
openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024)
openllm.utils.analytics.ModelSaveEvent(
module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024
)
)
finally:
bentomodel.exit_cloudpickle_context(imported_modules)
bentomodel.exit_cloudpickle_context(bentomodel.imported_modules)
return bentomodel

View File

@@ -1,22 +0,0 @@
import types
from contextlib import contextmanager
from typing import Iterator, Optional, Sequence, Tuple
import transformers
from bentoml import Model
from openllm_core._typing_compat import M, T
from .._llm import LLM
# Return the commit hash recorded on a Hugging Face config (its `_commit_hash` attribute).
def get_hash(config: transformers.PretrainedConfig) -> str: ...
# Copy a revision into `llm`'s tag version and `_revision` when those are unset.
# NOTE(review): how the revision is derived when `_revision` is omitted is only partly visible here — confirm against the implementation.
def patch_correct_tag(llm: LLM[M, T], config: transformers.PretrainedConfig, _revision: Optional[str] = ...) -> None: ...
# Context manager used while saving a model: per the annotation it yields a
# (bentoml Model, sequence of modules) pair for the caller to populate.
@contextmanager
def save_model(
llm: LLM[M, T],
config: transformers.PretrainedConfig,
safe_serialisation: bool,
trust_remote_code: bool,
module: str,
external_modules: Sequence[types.ModuleType],
) -> Iterator[Tuple[Model, Sequence[types.ModuleType]]]: ...

View File

@@ -1,90 +0,0 @@
import importlib
import logging
import shutil
import transformers
import bentoml
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import is_ctranslate_available
from .._helpers import patch_correct_tag, save_model
from ..transformers._helpers import get_tokenizer, process_config
if not is_ctranslate_available():
raise RuntimeError("'ctranslate2' is required to use with backend 'ctranslate'. Install it with 'pip install \"openllm[ctranslate]\"'")
import ctranslate2
from ctranslate2.converters.transformers import TransformersConverter
logger = logging.getLogger(__name__)
def _get_class(llm):
    """Pick the CTranslate2 runtime class for ``llm``: Translator for seq2seq models, Generator otherwise."""
    if llm.config['model_type'] == 'seq2seq_lm':
        return ctranslate2.Translator
    return ctranslate2.Generator
# Import `llm`'s model into a bentomodel for the 'ctranslate' backend.
# A local model directory is copied as-is; otherwise the checkpoint named by
# `llm.model_id` is converted with CTranslate2's TransformersConverter.
# The tokenizer and the original HF config (under '/hf/') are saved alongside.
def import_model(llm, *decls, trust_remote_code, **attrs):
(_base_decls, _base_attrs), tokenizer_attrs = llm.llm_parameters
for it in {'device_map', 'torch_dtype'}:
_base_attrs.pop(it, None) # pop out hf-specific attributes
# NOTE(review): `decls` is rebuilt here but not referenced again in this function.
decls = (*_base_decls, *decls)
# Caller-supplied attrs take precedence over the LLM's base attrs.
attrs = {**_base_attrs, **attrs}
low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True)
logger.debug(
'Note that CTranslate2 will load into memory for conversion. Refer to https://opennmt.net/CTranslate2/guides/transformers.html for more information.'
)
if not llm._local:
logger.warning(
"It is RECOMMENDED to convert '%s' to CTranslate2 format yourself to utilise CTranslate2's features, then start with `openllm start /path/to/ct2-dir`. OpenLLM will conservely apply quantization for conversion if specified.",
llm.model_id,
)
config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
patch_correct_tag(llm, config)
tokenizer = get_tokenizer(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
# The third positional argument (False) is save_model's safe_serialisation flag.
with save_model(llm, config, False, trust_remote_code, 'ctranslate', [importlib.import_module(tokenizer.__module__)]) as save_metadata:
bentomodel, _ = save_metadata
if llm._local:
# Local path: copy the directory verbatim, skipping VCS and virtualenv folders.
shutil.copytree(
llm.model_id, bentomodel.path, symlinks=False, ignore=shutil.ignore_patterns('.git', 'venv', '__pycache__', '.venv'), dirs_exist_ok=True
)
else:
# Remote model: convert to CTranslate2 format, applying `llm.quantise` as the quantization.
TransformersConverter(
llm.model_id,
load_as_float16=llm.quantise in ('float16', 'int8_float16'),
low_cpu_mem_usage=low_cpu_mem_usage,
trust_remote_code=trust_remote_code,
).convert(bentomodel.path, quantization=llm.quantise, force=True)
# Save the original HF configuration to hf
config.save_pretrained(bentomodel.path_of('/hf/'))
tokenizer.save_pretrained(bentomodel.path)
return bentomodel
def get(llm):
    """Look up the stored bentomodel for ``llm`` and sync the LLM's tag with its saved revision.

    The HF config is read back from the model's '/hf/' sub-directory.

    Raises:
        OpenLLMException: if the lookup fails for any reason, including a mismatch between
            the backend the model was saved with and the backend of ``llm``.
    """
    try:
        model = bentoml.models.get(llm.tag)
        backend = model.info.labels['backend']
        if backend != llm.__llm_backend__:
            raise OpenLLMException(
                f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'."
            )
        hf_config = transformers.AutoConfig.from_pretrained(
            model.path_of('/hf/'), trust_remote_code=llm.trust_remote_code
        )
        patch_correct_tag(llm, hf_config, _revision=model.info.metadata.get('_revision'))
    except Exception as err:
        raise OpenLLMException(f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
    else:
        return model
def load_model(llm, *decls, **attrs):
    """Instantiate the CTranslate2 runtime for ``llm`` from its bentomodel directory.

    The compute type is ``llm.quantise`` when set; otherwise it is derived from
    ``llm.__llm_dtype__`` ('half' -> 'float16', 'float' -> 'float32', anything else unchanged).
    """
    device = 'cuda' if llm._has_gpus else 'cpu'
    if llm.quantise:
        compute_type = llm.quantise
    else:
        dtype_name = llm.__llm_dtype__
        compute_type = {'half': 'float16', 'float': 'float32'}.get(dtype_name, dtype_name)
    runtime_cls = _get_class(llm)
    return runtime_cls(llm.bentomodel.path, device=device, compute_type=compute_type)

View File

@@ -5,9 +5,5 @@ def import_model(llm, *decls, trust_remote_code=True, **attrs):
raise NotImplementedError('Currently work in progress.')
# Placeholder: always raises NotImplementedError; this operation is not implemented yet.
def get(llm):
raise NotImplementedError('Currently work in progress.')
# Placeholder: always raises NotImplementedError; this operation is not implemented yet.
def load_model(llm, *decls, **attrs):
raise NotImplementedError('Currently work in progress.')

View File

@@ -1,84 +1,153 @@
from __future__ import annotations
import importlib, logging
import orjson, torch, transformers, bentoml, openllm
import functools, importlib, logging, orjson, torch, transformers, openllm, attr
from huggingface_hub import snapshot_download
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import first_not_none, is_autogptq_available, is_flash_attn_2_available
from openllm_core.utils import is_autogptq_available, is_flash_attn_2_available
from ._helpers import get_tokenizer, infer_autoclass_from_llm, process_config
from .weights import HfIgnore
from .._helpers import patch_correct_tag, save_model
from .._helpers import save_model, get_hash
logger = logging.getLogger(__name__)
__all__ = ['import_model', 'get', 'load_model']
__all__ = ['import_model', 'load_model']
_object_setattr = object.__setattr__
TOKENIZER_ATTRS = {'padding_side': 'left', 'truncation_side': 'left'}
def import_model(llm, *decls, trust_remote_code, **attrs):
(_base_decls, _base_attrs), tokenizer_attrs = llm.llm_parameters
decls = (*_base_decls, *decls)
@functools.lru_cache(maxsize=1)
def has_gpus() -> bool:
    """Report whether at least one CUDA device is visible through the ``cuda`` driver bindings.

    Returns:
        True only when the bindings import, the driver initialises, the device count
        query succeeds, and that count is greater than zero. False in every other case.

    The result is cached, so the driver is probed at most once per process.
    """
    try:
        from cuda import cuda

        err, *_ = cuda.cuInit(0)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('Failed to initialise CUDA runtime binding.')
        err, device_count = cuda.cuDeviceGetCount()
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('Failed to get CUDA device count.')
    except (ImportError, RuntimeError):
        return False
    # A successful query can still report zero devices, so check the count itself
    # rather than treating a successful call as proof that a GPU exists.
    return device_count > 0
# Maps the dtype names accepted by `_torch_dtype` to torch dtypes.
# 'half' and 'float' are aliases for 'float16' and 'float32' respectively.
_TORCH_DTYPE_MAPPING = {
'half': torch.float16,
'float16': torch.float16, #
'float': torch.float32,
'float32': torch.float32, #
'bfloat16': torch.bfloat16,
}
def _torch_dtype(dtype: str, model_id: str, trust_remote_code: bool) -> 'torch.dtype':
    """Resolve the torch dtype to load ``model_id`` with.

    With ``dtype == 'auto'`` the dtype from the model's HF config is used, except that a
    float32 (or missing) config dtype is downgraded to float16. Any other value is looked
    up in ``_TORCH_DTYPE_MAPPING``. When CUDA is unavailable the result is always float32.

    Raises:
        OpenLLMException: if ``dtype`` is neither 'auto' nor a key of ``_TORCH_DTYPE_MAPPING``.
    """
    hf_config = transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    config_dtype = getattr(hf_config, 'torch_dtype', None)
    if config_dtype is None:
        config_dtype = torch.float32

    if dtype != 'auto':
        resolved = _TORCH_DTYPE_MAPPING.get(dtype, None)
        if resolved is None:
            raise OpenLLMException(f'dtype not yet supported: {dtype}')
    elif config_dtype == torch.float32:
        resolved = torch.float16
    else:
        resolved = config_dtype

    # Without CUDA only float32 is kept; every other dtype falls back to float32.
    if torch.cuda.is_available() or resolved == torch.float32:
        return resolved
    return torch.float32
def import_model(
*decls,
_model_id=None,
_bentomodel_tag=None,
_backend=None,
_local=False,
_quantization_config=None,
_quantize=None,
_dtype='auto',
_serialisation='safetensors',
trust_remote_code,
**attrs,
):
_base_attrs = {
'device_map': 'auto' if has_gpus() else None,
'safe_serialization': _serialisation == 'safetensors',
'torch_dtype': _torch_dtype(_dtype, _model_id, trust_remote_code),
}
attrs = {**_base_attrs, **attrs}
if llm._local:
logger.warning('Given model is a local model, OpenLLM will load model into memory for serialisation.')
config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
patch_correct_tag(llm, config)
safe_serialisation = first_not_none(attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors')
if llm.quantise != 'gptq':
attrs['use_safetensors'] = safe_serialisation
model = None
tokenizer = get_tokenizer(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
config, hub_attrs, attrs = process_config(_model_id, trust_remote_code, **attrs)
_revision = get_hash(config) if not _local else None
if _revision:
_bentomodel_tag = attr.evolve(_bentomodel_tag, version=_revision)
model, tokenizer = (
None,
get_tokenizer(_model_id, trust_remote_code=trust_remote_code, **hub_attrs, **TOKENIZER_ATTRS),
)
with save_model(
llm, config, safe_serialisation, trust_remote_code, 'transformers', [importlib.import_module(tokenizer.__module__)]
) as save_metadata:
bentomodel, imported_modules = save_metadata
_bentomodel_tag,
config,
_serialisation,
trust_remote_code,
'transformers',
[importlib.import_module(tokenizer.__module__)],
_model_id,
_quantize,
_backend,
_local,
_dtype,
) as bentomodel:
tokenizer.save_pretrained(bentomodel.path)
if llm._quantization_config or (llm.quantise and llm.quantise not in {'squeezellm', 'awq'}):
attrs['quantization_config'] = llm.quantization_config
if llm.quantise == 'gptq' and llm.__llm_backend__ == 'pt':
if _quantization_config or (_quantize and _quantize not in {'squeezellm', 'awq'}):
attrs['quantization_config'] = _quantization_config
if _quantize == 'gptq' and _backend == 'pt':
from optimum.gptq.constants import GPTQ_CONFIG
with open(bentomodel.path_of(GPTQ_CONFIG), 'w', encoding='utf-8') as f:
f.write(orjson.dumps(config.quantization_config, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode())
if llm._local: # possible local path
model = infer_autoclass_from_llm(llm, config).from_pretrained(
llm.model_id, *decls, local_files_only=True, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs
)
if _local:
try:
model = transformers.AutoModelForCausalLM.from_pretrained(
_model_id,
*decls,
local_files_only=True,
config=config,
trust_remote_code=trust_remote_code,
**hub_attrs,
**attrs,
)
except Exception:
try:
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
_model_id,
*decls,
local_files_only=True,
config=config,
trust_remote_code=trust_remote_code,
**hub_attrs,
**attrs,
)
except Exception as err:
raise OpenLLMException(f'Failed to load model from {_model_id}') from err
# for trust_remote_code to work
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
model.save_pretrained(bentomodel.path, max_shard_size='2GB', safe_serialization=safe_serialisation)
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], bentomodel.imported_modules)
model.save_pretrained(bentomodel.path, max_shard_size='2GB', safe_serialization=_serialisation == 'safetensors')
del model
if torch.cuda.is_available():
torch.cuda.empty_cache()
else:
# we will clone everything into the bentomodel path without loading the model into memory
snapshot_download(
llm.model_id,
local_dir=bentomodel.path, #
_model_id,
local_dir=bentomodel.path,
local_dir_use_symlinks=False,
ignore_patterns=HfIgnore.ignore_patterns(llm), #
ignore_patterns=HfIgnore.ignore_patterns(_backend, _model_id),
)
return bentomodel
def get(llm):
    """Fetch the bentomodel stored under ``llm.tag`` and align the LLM's tag with its saved revision.

    Raises:
        OpenLLMException: on any failure, including when the stored model was saved with a
            backend different from ``llm.__llm_backend__``.
    """
    try:
        model = bentoml.models.get(llm.tag)
        backend = model.info.labels['backend']
        if backend != llm.__llm_backend__:
            raise OpenLLMException(
                f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'."
            )
        # The HF config lives at the root of the bentomodel directory.
        stored_config = transformers.AutoConfig.from_pretrained(model.path, trust_remote_code=llm.trust_remote_code)
        patch_correct_tag(llm, stored_config, _revision=model.info.metadata.get('_revision'))
    except Exception as err:
        raise OpenLLMException(f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
    else:
        return model
def check_unintialised_params(model):
unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')]
if len(unintialized) > 0:
@@ -124,7 +193,9 @@ def load_model(llm, *decls, **attrs):
)
except Exception as err:
logger.debug("Failed to load model with 'use_flash_attention_2' (lookup for traceback):\n%s", err)
model = auto_class.from_pretrained(llm.bentomodel.path, device_map=device_map, trust_remote_code=llm.trust_remote_code, **attrs)
model = auto_class.from_pretrained(
llm.bentomodel.path, device_map=device_map, trust_remote_code=llm.trust_remote_code, **attrs
)
else:
try:
model = auto_class.from_pretrained(
@@ -139,7 +210,25 @@ def load_model(llm, *decls, **attrs):
except Exception as err:
logger.debug("Failed to load model with 'use_flash_attention_2' (lookup for traceback):\n%s", err)
model = auto_class.from_pretrained(
llm.bentomodel.path, *decls, config=config, trust_remote_code=llm.trust_remote_code, device_map=device_map, **attrs
llm.bentomodel.path,
*decls,
config=config,
trust_remote_code=llm.trust_remote_code,
device_map=device_map,
**attrs,
)
check_unintialised_params(model)
# If this hits OOM, you probably don't have enough VRAM to run this model.
loaded_in_kbit = (
getattr(model, 'is_loaded_in_8bit', False)
or getattr(model, 'is_loaded_in_4bit', False)
or getattr(model, 'is_quantized', False)
)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
model = model.to('cuda')
except Exception as err:
raise OpenLLMException(f'Failed to load model into GPU: {err}.\n') from err
return model

View File

@@ -6,7 +6,9 @@ logger = logging.getLogger(__name__)
def get_tokenizer(model_id_or_path, trust_remote_code, **attrs):
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code, **attrs)
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_id_or_path, trust_remote_code=trust_remote_code, **attrs
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
return tokenizer

View File

@@ -6,7 +6,6 @@ from openllm_core.utils import resolve_filepath, validate_is_path
if t.TYPE_CHECKING:
from huggingface_hub.hf_api import ModelInfo as HfModelInfo
import openllm
__global_inst__ = None
__cached_id__: dict[str, HfModelInfo] = dict()
@@ -49,16 +48,16 @@ class HfIgnore:
gguf = '*.gguf'
@classmethod
def ignore_patterns(cls, llm: openllm.LLM[t.Any, t.Any]) -> list[str]:
if llm.__llm_backend__ in {'vllm', 'pt'}:
def ignore_patterns(cls, backend, model_id) -> list[str]:
if backend in {'vllm', 'pt'}:
base = [cls.tf, cls.flax, cls.gguf]
if has_safetensors_weights(llm.model_id):
if has_safetensors_weights(model_id):
base.extend([cls.pt, '*.pt'])
elif has_pt_weights(llm.model_id):
elif has_pt_weights(model_id):
base.extend([cls.safetensors, cls.pt])
else:
base.append(cls.safetensors)
elif llm.__llm_backend__ == 'ggml':
elif backend == 'ggml':
base = [cls.tf, cls.flax, cls.pt, cls.safetensors]
else:
raise ValueError('Unknown backend (should never happen at all.)')

View File

@@ -0,0 +1,42 @@
import openllm, traceback
from openllm_core.utils import is_vllm_available
from ..transformers import import_model
__all__ = ['import_model', 'load_model']
def load_model(llm, *decls, **attrs):
    """Create a vLLM ``AsyncLLMEngine`` that serves the bentomodel of ``llm``.

    Raises:
        RuntimeError: if vLLM is not installed.
        OpenLLMException: if the engine fails to initialise (the traceback is printed first).
    """
    if not is_vllm_available():
        raise RuntimeError(
            "'vllm' is required to use with backend 'vllm'. Install it with 'pip install \"openllm[vllm]\"'"
        )
    import vllm
    import torch

    # Tensor parallelism: one GPU by default, otherwise the device count rounded down to an even number.
    dev = openllm.utils.device_count()
    if dev >= 2:
        even_devices = dev // 2 * 2
        num_gpus = min(even_devices, dev)
    else:
        num_gpus = 1

    # Only these quantisation schemes are forwarded to vLLM.
    quantise = llm.quantise
    if not quantise or quantise not in {'gptq', 'awq', 'squeezellm'}:
        quantise = None

    # NOTE: quantise GPTQ doesn't support bfloat16 yet.
    if quantise == 'gptq':
        dtype = torch.float16
    else:
        dtype = llm._torch_dtype

    try:
        engine_args = vllm.AsyncEngineArgs(
            worker_use_ray=False,
            engine_use_ray=False,
            tokenizer_mode='auto',
            tensor_parallel_size=num_gpus,
            model=llm.bentomodel.path,
            tokenizer=llm.bentomodel.path,
            trust_remote_code=llm.trust_remote_code,
            dtype=dtype,
            max_model_len=llm._max_model_len,
            gpu_memory_utilization=llm._gpu_memory_utilization,
            quantization=quantise,
        )
        return vllm.AsyncLLMEngine.from_engine_args(engine_args)
    except Exception as err:
        traceback.print_exc()
        raise openllm.exceptions.OpenLLMException(
            f'Failed to initialise vLLMEngine due to the following error:\n{err}'
        ) from err

View File

@@ -1,15 +1,12 @@
import functools, importlib.metadata, openllm_core
__all__ = ['generate_labels', 'available_devices', 'device_count']
__all__ = ['available_devices', 'device_count', 'generate_labels']
def generate_labels(llm):
def generate_labels(serialisation):
return {
'backend': llm.__llm_backend__,
'framework': 'openllm',
'model_name': llm.config['model_name'], #
'architecture': llm.config['architecture'],
'serialisation': llm._serialisation, #
'serialisation': serialisation,
**{package: importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
}

View File

@@ -13,6 +13,7 @@ from openllm_core.utils import (
LazyModule as LazyModule,
ReprMixin as ReprMixin,
VersionInfo as VersionInfo,
correct_closure as correct_closure,
analytics as analytics,
calc_dir_size as calc_dir_size,
check_bool_env as check_bool_env,
@@ -34,7 +35,6 @@ from openllm_core.utils import (
is_autogptq_available as is_autogptq_available,
is_bentoml_available as is_bentoml_available,
is_bitsandbytes_available as is_bitsandbytes_available,
is_ctranslate_available as is_ctranslate_available,
is_flash_attn_2_available as is_flash_attn_2_available,
is_grpc_available as is_grpc_available,
is_jupyter_available as is_jupyter_available,
@@ -49,15 +49,15 @@ from openllm_core.utils import (
resolve_filepath as resolve_filepath,
resolve_user_filepath as resolve_user_filepath,
serde as serde,
pkg as pkg,
set_debug_mode as set_debug_mode,
set_disable_warnings as set_disable_warnings,
set_quiet_mode as set_quiet_mode,
validate_is_path as validate_is_path,
)
from openllm_core.utils.serde import converter as converter
from ._llm import LLM
from openllm_core._typing_compat import LiteralSerialisation as _LiteralSerialisation
def available_devices() -> Tuple[str, ...]: ...
def device_count() -> int: ...
def generate_labels(llm: LLM[Any, Any]) -> Dict[str, Any]: ...
def generate_labels(serialisation: _LiteralSerialisation) -> Dict[str, Any]: ...