revert: "ci: pre-commit autoupdate [pre-commit.ci] (#931)"

This reverts commit 7b00c84c2a.
Author: Aaron
Date:   2024-03-15 03:47:23 -04:00
parent 7b00c84c2a
commit e3392476be

69 changed files with 368 additions and 1300 deletions


@@ -9,16 +9,10 @@ else:
# configuration for bitsandbytes before import
_os.environ['BITSANDBYTES_NOWELCOME'] = _os.environ.get('BITSANDBYTES_NOWELCOME', '1')
# NOTE: The following warnings from bitsandbytes, and probably not that important for users to see when DEBUG is False
_warnings.filterwarnings(
'ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization'
)
_warnings.filterwarnings(
'ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization'
)
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
_warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
_warnings.filterwarnings(
'ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated'
)
_warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated')
COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so')
__lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__name__,


@@ -11,48 +11,14 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
"""
# update-config-stubs.py: import stubs start
from openlm_core.config import (
CONFIG_MAPPING as CONFIG_MAPPING,
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
AutoConfig as AutoConfig,
BaichuanConfig as BaichuanConfig,
ChatGLMConfig as ChatGLMConfig,
DollyV2Config as DollyV2Config,
FalconConfig as FalconConfig,
FlanT5Config as FlanT5Config,
GPTNeoXConfig as GPTNeoXConfig,
LlamaConfig as LlamaConfig,
MistralConfig as MistralConfig,
MixtralConfig as MixtralConfig,
MPTConfig as MPTConfig,
OPTConfig as OPTConfig,
PhiConfig as PhiConfig,
QwenConfig as QwenConfig,
StableLMConfig as StableLMConfig,
StarCoderConfig as StarCoderConfig,
YiConfig as YiConfig,
)
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
# update-config-stubs.py: import stubs stop
from openllm_cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start
from openllm_core._configuration import (
GenerationConfig as GenerationConfig,
LLMConfig as LLMConfig,
SamplingParams as SamplingParams,
)
from openllm_core._schemas import (
GenerationInput as GenerationInput,
GenerationOutput as GenerationOutput,
MetadataOutput as MetadataOutput,
)
from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams
from openllm_core._schemas import GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput
from . import (
bundle as bundle,
client as client,
exceptions as exceptions,
serialisation as serialisation,
utils as utils,
)
from . import bundle as bundle, client as client, exceptions as exceptions, serialisation as serialisation, utils as utils
from ._deprecated import Runner as Runner
from ._llm import LLM as LLM
from ._quantisation import infer_quantisation_config as infer_quantisation_config


@@ -19,9 +19,7 @@ def Runner(
if llm_config is None:
llm_config = openllm.AutoConfig.for_model(model_name)
if not ensure_available:
logger.warning(
"'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation."
)
logger.warning("'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation.")
model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id']))
warnings.warn(
f"""\
@@ -42,13 +40,8 @@ def Runner(
attrs.update({
'model_id': model_id,
'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), #
'serialisation': getenv(
'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']
),
'serialisation': getenv('serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']),
})
return openllm.LLM(
backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'),
llm_config=llm_config,
embedded=init_local,
**attrs,
backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), llm_config=llm_config, embedded=init_local, **attrs
).runner


@@ -47,9 +47,7 @@ ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]
@attr.define(slots=False, repr=False, init=False)
class LLM(t.Generic[M, T]):
async def generate(
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
if adapter_name is not None and self.__llm_backend__ != 'pt':
raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
config = self.config.model_construct_env(**attrs)
@@ -64,15 +62,10 @@ class LLM(t.Generic[M, T]):
raise RuntimeError('No result is returned.')
return final_result.with_options(
prompt=prompt,
outputs=[
output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index])
for output in final_result.outputs
],
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs],
)
async def generate_iterator(
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
):
async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
if adapter_name is not None and self.__llm_backend__ != 'pt':
@@ -137,9 +130,7 @@ class LLM(t.Generic[M, T]):
# The below are mainly for internal implementation that you don't have to worry about.
_model_id: str
_revision: t.Optional[str] #
_quantization_config: t.Optional[
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
]
_quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]]
_quantise: t.Optional[LiteralQuantise]
_model_decls: t.Tuple[t.Any, ...]
__model_attrs: t.Dict[str, t.Any] #
@@ -155,9 +146,7 @@ class LLM(t.Generic[M, T]):
__llm_torch_dtype__: 'torch.dtype' = None
__llm_config__: t.Optional[LLMConfig] = None
__llm_backend__: LiteralBackend = None
__llm_quantization_config__: t.Optional[
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
] = None
__llm_quantization_config__: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] = None
__llm_runner__: t.Optional[Runner[M, T]] = None
__llm_model__: t.Optional[M] = None
__llm_tokenizer__: t.Optional[T] = None
@@ -188,9 +177,7 @@ class LLM(t.Generic[M, T]):
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
if torch_dtype is not None:
warnings.warn(
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
DeprecationWarning,
stacklevel=3,
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3
)
dtype = torch_dtype
_local = False
@@ -246,19 +233,13 @@ class LLM(t.Generic[M, T]):
class _Quantise:
@staticmethod
def pt(llm: LLM, quantise=None):
return quantise
def pt(llm: LLM, quantise=None): return quantise
@staticmethod
def vllm(llm: LLM, quantise=None):
return quantise
def vllm(llm: LLM, quantise=None): return quantise
@staticmethod
def ctranslate(llm: LLM, quantise=None):
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:
raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
if quantise == 'int8':
quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}: raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
if quantise == 'int8': quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
return quantise
@apply(lambda val: tuple(str.lower(i) if i else i for i in val))
@@ -266,15 +247,10 @@ class LLM(t.Generic[M, T]):
model_id, *maybe_revision = model_id.rsplit(':')
if len(maybe_revision) > 0:
if model_version is not None:
logger.warning(
"revision is specified (%s). 'model_version=%s' will be ignored.", maybe_revision[0], model_version
)
logger.warning("revision is specified (%s). 'model_version=%s' will be ignored.", maybe_revision[0], model_version)
model_version = maybe_revision[0]
if validate_is_path(model_id):
model_id, model_version = (
resolve_filepath(model_id),
first_not_none(model_version, default=generate_hash_from_file(model_id)),
)
model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
return f'{backend}-{normalise_model_name(model_id)}', model_version
@functools.cached_property
@@ -283,11 +259,9 @@ class LLM(t.Generic[M, T]):
from cuda import cuda
err, *_ = cuda.cuInit(0)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Failed to initialise CUDA runtime binding.')
if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.')
err, _ = cuda.cuDeviceGetCount()
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Failed to get CUDA device count.')
if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to get CUDA device count.')
return True
except (ImportError, RuntimeError):
return False
@@ -299,9 +273,7 @@ class LLM(t.Generic[M, T]):
_map = _torch_dtype_mapping()
if not isinstance(self.__llm_torch_dtype__, torch.dtype):
try:
hf_config = transformers.AutoConfig.from_pretrained(
self.bentomodel.path, trust_remote_code=self.trust_remote_code
)
hf_config = transformers.AutoConfig.from_pretrained(self.bentomodel.path, trust_remote_code=self.trust_remote_code)
except OpenLLMException:
hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
config_dtype = getattr(hf_config, 'torch_dtype', None)
@@ -332,9 +304,7 @@ class LLM(t.Generic[M, T]):
return {**self.import_kwargs[1], **self.__tokenizer_attrs}
def _cascade_backend(self) -> LiteralBackend:
logger.warning(
'It is recommended to specify the backend explicitly. Cascading backend might lead to unexpected behaviour.'
)
logger.warning('It is recommended to specify the backend explicitly. Cascading backend might lead to unexpected behaviour.')
if self._has_gpus:
if is_vllm_available():
return 'vllm'
@@ -369,10 +339,7 @@ class LLM(t.Generic[M, T]):
@property
def import_kwargs(self):
return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {
'padding_side': 'left',
'truncation_side': 'left',
}
return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {'padding_side': 'left', 'truncation_side': 'left'}
@property
def trust_remote_code(self):
@@ -405,9 +372,7 @@ class LLM(t.Generic[M, T]):
if self._quantization_config is not None:
self.__llm_quantization_config__ = self._quantization_config
elif self._quantise is not None:
self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(
self, self._quantise, **self._model_attrs
)
self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self, self._quantise, **self._model_attrs)
else:
raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
return self.__llm_quantization_config__
@@ -462,11 +427,7 @@ class LLM(t.Generic[M, T]):
model = get_peft_model(
prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checking),
self.config['fine_tune_strategies']
.get(adapter_type, self.config.make_fine_tune_config(adapter_type))
.train()
.with_config(**attrs)
.build(),
self.config['fine_tune_strategies'].get(adapter_type, self.config.make_fine_tune_config(adapter_type)).train().with_config(**attrs).build(),
)
if DEBUG:
model.print_trainable_parameters()
@@ -486,10 +447,7 @@ class LLM(t.Generic[M, T]):
if self.__llm_adapter_map__ is None:
_map: ResolvedAdapterMap = {k: {} for k in self._adapter_map}
for adapter_type, adapter_tuple in self._adapter_map.items():
base = first_not_none(
self.config['fine_tune_strategies'].get(adapter_type),
default=self.config.make_fine_tune_config(adapter_type),
)
base = first_not_none(self.config['fine_tune_strategies'].get(adapter_type), default=self.config.make_fine_tune_config(adapter_type))
for adapter in adapter_tuple:
_map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id)
self.__llm_adapter_map__ = _map
@@ -504,9 +462,7 @@ class LLM(t.Generic[M, T]):
import torch
loaded_in_kbit = (
getattr(model, 'is_loaded_in_8bit', False)
or getattr(model, 'is_loaded_in_4bit', False)
or getattr(model, 'is_quantized', False)
getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
@@ -528,9 +484,7 @@ class LLM(t.Generic[M, T]):
if self.__llm_config__ is None:
if self.__llm_backend__ == 'ctranslate':
try:
config = transformers.AutoConfig.from_pretrained(
self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code
)
config = transformers.AutoConfig.from_pretrained(self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code)
except OpenLLMException:
config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
for architecture in config.architectures:
@@ -563,18 +517,12 @@ def _torch_dtype_mapping() -> dict[str, torch.dtype]:
def normalise_model_name(name: str) -> str:
return (
os.path.basename(resolve_filepath(name))
if validate_is_path(name)
else inflection.dasherize(name.replace('/', '--'))
)
return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/', '--'))
def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
if not is_peft_available():
raise RuntimeError(
"LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'"
)
raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'")
from huggingface_hub import hf_hub_download
resolved: AdapterMap = {}


@@ -8,16 +8,7 @@ from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2Seq
from bentoml import Model, Tag
from openllm_core import LLMConfig
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
AdapterType,
LiteralBackend,
LiteralDtype,
LiteralQuantise,
LiteralSerialisation,
M,
T,
)
from openllm_core._typing_compat import AdapterMap, AdapterType, LiteralBackend, LiteralDtype, LiteralQuantise, LiteralSerialisation, M, T
from ._quantisation import QuantizationConfig
from ._runners import Runner
@@ -121,9 +112,7 @@ class LLM(Generic[M, T]):
def runner(self) -> Runner[M, T]: ...
@property
def adapter_map(self) -> ResolvedAdapterMap: ...
def prepare(
self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any
) -> Tuple[InjectedModel, T]: ...
def prepare(self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any) -> Tuple[InjectedModel, T]: ...
async def generate(
self,
prompt: Optional[str],


@@ -83,25 +83,19 @@ def infer_quantisation_config(llm, quantise, **attrs):
# NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training.
if not is_bitsandbytes_available():
raise RuntimeError(
'Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\''
)
raise RuntimeError('Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'')
if quantise == 'int8':
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available():
raise MissingDependencyError(
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
raise MissingDependencyError("GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'")
else:
quantisation_config = create_gptq_config()
elif quantise == 'awq':
if not is_autoawq_available():
raise MissingDependencyError(
"AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'."
)
raise MissingDependencyError("AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'.")
else:
quantisation_config = create_awq_config()
else:


@@ -9,18 +9,10 @@ from ._llm import LLM
QuantizationConfig = Union[BitsAndBytesConfig, GPTQConfig, AwqConfig]
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any
) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ...
def infer_quantisation_config(self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any
) -> tuple[GPTQConfig, Dict[str, Any]]: ...
def infer_quantisation_config(self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any) -> tuple[GPTQConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: Literal['awq'], **attrs: Any
) -> tuple[AwqConfig, Dict[str, Any]]: ...
def infer_quantisation_config(self: LLM[M, T], quantise: Literal['awq'], **attrs: Any) -> tuple[AwqConfig, Dict[str, Any]]: ...
@overload
def infer_quantisation_config(
self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any
) -> tuple[QuantizationConfig, Dict[str, Any]]: ...
def infer_quantisation_config(self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any) -> tuple[QuantizationConfig, Dict[str, Any]]: ...


@@ -46,10 +46,7 @@ def runner(llm: openllm.LLM[M, T]) -> Runner[M, T]:
(
'runner_methods',
{
method.name: {
'batchable': method.config.batchable,
'batch_dim': method.config.batch_dim if method.config.batchable else None,
}
method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None}
for method in _.runner_methods
},
),
@@ -114,7 +111,6 @@ class CTranslateRunnable(bentoml.Runnable):
).model_dump_json()
yield bentoml.io.SSE(out).marshal()
@registry
class vLLMRunnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
@@ -130,9 +126,7 @@ class vLLMRunnable(bentoml.Runnable):
if dev >= 2:
num_gpus = min(dev // 2 * 2, dev)
quantise = llm.quantise if llm.quantise and llm.quantise in {'gptq', 'awq', 'squeezellm'} else None
dtype = (
torch.float16 if quantise == 'gptq' else llm._torch_dtype
) # NOTE: quantise GPTQ doesn't support bfloat16 yet.
dtype = torch.float16 if quantise == 'gptq' else llm._torch_dtype # NOTE: quantise GPTQ doesn't support bfloat16 yet.
try:
self.model = vllm.AsyncLLMEngine.from_engine_args(
vllm.AsyncEngineArgs(
@@ -151,9 +145,7 @@ class vLLMRunnable(bentoml.Runnable):
)
except Exception as err:
traceback.print_exc()
raise openllm.exceptions.OpenLLMException(
f'Failed to initialise vLLMEngine due to the following error:\n{err}'
) from err
raise openllm.exceptions.OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
@bentoml.Runnable.method(batchable=False)
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
@@ -210,9 +202,7 @@ class PyTorchRunnable(bentoml.Runnable):
if config['logprobs']: # FIXME: logprobs is not supported
raise NotImplementedError('Logprobs is yet to be supported with encoder-decoder models.')
encoder_output = self.model.encoder(input_ids=torch.as_tensor([prompt_token_ids], device=self.device))[0]
start_ids = torch.as_tensor(
[[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device
)
start_ids = torch.as_tensor([[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device)
else:
start_ids = torch.as_tensor([prompt_token_ids], device=self.device)
@@ -240,9 +230,7 @@ class PyTorchRunnable(bentoml.Runnable):
)
logits = self.model.lm_head(out[0])
else:
out = self.model(
input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True
)
out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True)
logits = out.logits
past_key_values = out.past_key_values
if logits_processor:
@@ -286,12 +274,7 @@ class PyTorchRunnable(bentoml.Runnable):
tmp_output_ids, rfind_start = output_token_ids[input_len:], 0
# XXX: Move this to API server
text = self.tokenizer.decode(
tmp_output_ids,
skip_special_tokens=True,
spaces_between_special_tokens=False,
clean_up_tokenization_spaces=True,
)
text = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
if len(stop) > 0:
for it in stop:


@@ -1,19 +1,4 @@
from typing import (
Any,
AsyncGenerator,
Dict,
Generic,
Iterable,
List,
Literal,
Optional,
Protocol,
Tuple,
Type,
TypeVar,
Union,
final,
)
from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Protocol, Tuple, Type, TypeVar, Union, final
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer
@@ -89,11 +74,7 @@ class Runner(Protocol[Mo, To]):
class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]):
@staticmethod
def async_stream(
prompt_token_ids: List[int],
request_id: str,
stop: Optional[Union[Iterable[str], str]] = ...,
adapter_name: Optional[str] = ...,
**attrs: Any,
prompt_token_ids: List[int], request_id: str, stop: Optional[Union[Iterable[str], str]] = ..., adapter_name: Optional[str] = ..., **attrs: Any
) -> AsyncGenerator[str, None]: ...
def __init__(


@@ -18,20 +18,12 @@ svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[l
llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
@svc.api(
route='/v1/generate',
input=JSON.from_sample(llm_model_class.examples()),
output=JSON.from_sample(openllm.GenerationOutput.examples()),
)
@svc.api(route='/v1/generate', input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()))
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
@svc.api(
route='/v1/generate_stream',
input=JSON.from_sample(llm_model_class.examples()),
output=Text(content_type='text/event-stream'),
)
@svc.api(route='/v1/generate_stream', input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
yield f'data: {it.model_dump_json()}\n\n'
@@ -76,6 +68,4 @@ def helpers_messages_v1(message: MessagesConverterInput) -> str:
return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
openllm.mount_entrypoints(
svc, llm
) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.


@@ -158,16 +158,12 @@ class _ResourceMixin:
elif isinstance(spec, list):
return [str(x) for x in spec]
else:
raise TypeError(
f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
)
raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")
@staticmethod
def validate(cls, val: list[t.Any]) -> None:
if cls.resource_id == 'amd.com/gpu':
raise RuntimeError(
"AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
)
raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'")
if not all(isinstance(i, str) for i in val):
raise ValueError('Input list should be all string type.')
@@ -311,18 +307,12 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
worker_index,
assigned_resource_per_worker,
)
raise IndexError(
f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]."
)
assigned_gpu = gpus[
assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)
]
raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].")
assigned_gpu = gpus[assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)]
dev = ','.join(assigned_gpu)
else:
idx = worker_index // workers_per_resource
if idx >= len(gpus):
raise ValueError(
f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}'
)
raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}')
dev = str(gpus[idx])
return dev


@@ -13,12 +13,7 @@ class CascadingResourceStrategy:
TODO: Support CloudTPUResource
"""
@classmethod
def get_worker_count(
cls,
runnable_class: Type[bentoml.Runnable],
resource_request: Optional[Dict[str, Any]],
workers_per_resource: float,
) -> int:
def get_worker_count(cls, runnable_class: Type[bentoml.Runnable], resource_request: Optional[Dict[str, Any]], workers_per_resource: float) -> int:
"""Return the number of workers to be used for the given runnable class.
Note that for all available GPU, the number of workers will always be 1.
@@ -40,7 +35,5 @@ class CascadingResourceStrategy:
worker_index: The index of the worker, start from 0.
"""
@staticmethod
def transpile_workers_to_cuda_envvar(
workers_per_resource: Union[float, int], gpus: List[str], worker_index: int
) -> str:
def transpile_workers_to_cuda_envvar(workers_per_resource: Union[float, int], gpus: List[str], worker_index: int) -> str:
"""Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string."""


@@ -38,12 +38,8 @@ def build_editable(path, package='openllm'):
def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
from . import RefResolver
openllm_package = 'openllm[vllm]' if llm.__llm_backend__.lower() == 'vllm' else 'openllm'
packages = [
'scipy',
'bentoml[tracing]>=1.1.11,<1.2',
f'{openllm_package}>={RefResolver.from_strategy("release").version}',
] # apparently bnb misses this one
openllm_package = 'openllm[vllm]' if llm.__llm_backend__.lower() == "vllm" else "openllm"
packages = ['scipy', 'bentoml[tracing]>=1.1.11,<1.2', f'{openllm_package}>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
if adapter_map is not None:
packages += ['openllm[fine-tune]']
if extra_dependencies is not None:
@@ -61,18 +57,7 @@ def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=N
def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template, serialisation):
from openllm_cli.entrypoint import process_environ
environ = process_environ(
llm.config,
llm.config['timeout'],
1.0,
None,
True,
llm.model_id,
None,
llm._serialisation,
llm,
use_current_env=False,
)
environ = process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm, use_current_env=False)
# XXX: We need to quote this so that the envvar in container recognize as valid json
environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'"
environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container
@@ -101,10 +86,7 @@ def create_bento(
'start_name': llm.config['start_name'],
'base_name_or_path': llm.model_id,
'bundler': 'openllm.bundle',
**{
f'{package.replace("-", "_")}_version': importlib.metadata.version(package)
for package in {'openllm', 'openllm-core', 'openllm-client'}
},
**{f'{package.replace("-","_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
})
if adapter_map:
labels.update(adapter_map)


@@ -13,10 +13,7 @@ from .._llm import LLM
def build_editable(path: str, package: LiteralString) -> Optional[str]: ...
def construct_python_options(
llm: LLM[M, T],
llm_fs: FS,
extra_dependencies: Optional[Tuple[str, ...]] = ...,
adapter_map: Optional[Dict[str, str]] = ...,
llm: LLM[M, T], llm_fs: FS, extra_dependencies: Optional[Tuple[str, ...]] = ..., adapter_map: Optional[Dict[str, str]] = ...
) -> PythonOptions: ...
def construct_docker_options(
llm: LLM[M, T],


@@ -11,7 +11,5 @@ def mount_entrypoints(svc, llm):
return svc
__lazy = LazyModule(
__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}
)
__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints})
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__


@@ -501,11 +501,7 @@ class OpenLLMSchemaGenerator(SchemaGenerator):
endpoints_info.extend(sub_endpoints)
elif not isinstance(route, Route) or not route.include_in_schema:
continue
elif (
inspect.isfunction(route.endpoint)
or inspect.ismethod(route.endpoint)
or isinstance(route.endpoint, functools.partial)
):
elif inspect.isfunction(route.endpoint) or inspect.ismethod(route.endpoint) or isinstance(route.endpoint, functools.partial):
endpoint = route.endpoint.func if isinstance(route.endpoint, functools.partial) else route.endpoint
path = self._remove_converter(route.path)
for method in route.methods or ['GET']:
@@ -552,9 +548,7 @@ def get_generator(title, components=None, tags=None, inject=True):
def component_schema_generator(attr_cls, description=None):
schema = {'type': 'object', 'required': [], 'properties': {}, 'title': attr_cls.__name__}
schema['description'] = first_not_none(
getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}'
)
schema['description'] = first_not_none(getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}')
for field in attr.fields(attr.resolve_types(attr_cls)):
attr_type = field.type
origin_type = t.get_origin(attr_type)
@@ -596,10 +590,7 @@ def component_schema_generator(attr_cls, description=None):
_SimpleSchema = types.new_class(
'_SimpleSchema',
(object,),
{},
lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it}),
'_SimpleSchema', (object,), {}, lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it})
)


@@ -17,13 +17,8 @@ class OpenLLMSchemaGenerator:
def apply_schema(func: Callable[P, Any], **attrs: Any) -> Callable[P, Any]: ...
def add_schema_definitions(func: Callable[P, Any]) -> Callable[P, Any]: ...
def append_schemas(
svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...
) -> Service: ...
def append_schemas(svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...) -> Service: ...
def component_schema_generator(attr_cls: Type[AttrsInstance], description: Optional[str] = ...) -> Dict[str, Any]: ...
def get_generator(
title: str,
components: Optional[List[Type[AttrsInstance]]] = ...,
tags: Optional[List[Dict[str, Any]]] = ...,
inject: bool = ...,
title: str, components: Optional[List[Type[AttrsInstance]]] = ..., tags: Optional[List[Dict[str, Any]]] = ..., inject: bool = ...
) -> OpenLLMSchemaGenerator: ...


@@ -59,10 +59,7 @@ def error_response(status_code, message):
async def check_model(request, model):
if request.model is None or request.model == model:
return None
return error_response(
HTTPStatus.NOT_FOUND,
f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.",
)
return error_response(HTTPStatus.NOT_FOUND, f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.")
def mount_to_svc(svc, llm):
@@ -71,17 +68,13 @@ def mount_to_svc(svc, llm):
routes=[
Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']),
Route(
'/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']
),
Route('/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']),
],
)
mount_path = '/cohere'
svc.mount_asgi_app(app, path=mount_path)
return append_schemas(
svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG
)
return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG)
@add_schema_definitions
@@ -140,18 +133,14 @@ async def cohere_generate(req, llm):
if final_result is None:
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
final_result = final_result.with_options(
outputs=[
output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index])
for output in final_result.outputs
]
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs]
)
return JSONResponse(
converter.unstructure(
Generations(
id=request_id,
generations=[
Generation(id=request_id, text=output.text, prompt=prompt, finish_reason=output.finish_reason)
for output in final_result.outputs
Generation(id=request_id, text=output.text, prompt=prompt, finish_reason=output.finish_reason) for output in final_result.outputs
],
)
),
@@ -258,9 +247,7 @@ async def cohere_chat(req, llm):
final_result = res
if final_result is None:
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
final_result = final_result.with_options(
outputs=[final_result.outputs[0].with_options(text=''.join(texts), token_ids=token_ids)]
)
final_result = final_result.with_options(outputs=[final_result.outputs[0].with_options(text=''.join(texts), token_ids=token_ids)])
num_prompt_tokens, num_response_tokens = len(final_result.prompt_token_ids), len(token_ids)
return JSONResponse(
converter.unstructure(


@@ -14,8 +14,6 @@ from ..protocol.cohere import CohereChatRequest, CohereGenerateRequest
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...
def jsonify_attr(obj: AttrsInstance) -> str: ...
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...
async def check_model(
request: Union[CohereGenerateRequest, CohereChatRequest], model: str
) -> Optional[JSONResponse]: ...
async def check_model(request: Union[CohereGenerateRequest, CohereChatRequest], model: str) -> Optional[JSONResponse]: ...
async def cohere_generate(req: Request, llm: LLM[M, T]) -> Response: ...
async def cohere_chat(req: Request, llm: LLM[M, T]) -> Response: ...


@@ -37,10 +37,7 @@ def mount_to_svc(svc, llm):
def error_response(status_code, message):
return JSONResponse(
converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)),
status_code=status_code.value,
)
return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
@add_schema_definitions
@@ -56,9 +53,7 @@ async def hf_agent(req, llm):
stop = request.parameters.pop('stop', [])
try:
result = await llm.generate(request.inputs, stop=stop, **request.parameters)
return JSONResponse(
converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value
)
return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value)
except Exception as err:
logger.error('Error while generating: %s', err)
return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')


@@ -61,11 +61,7 @@ def jsonify_attr(obj):
def error_response(status_code, message):
return JSONResponse(
{
'error': converter.unstructure(
ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value))
)
},
{'error': converter.unstructure(ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value)))},
status_code=status_code.value,
)
@@ -99,11 +95,7 @@ def create_logprobs(token_ids, top_logprobs, num_output_top_logprobs=None, initi
logprobs.text_offset.append(logprobs.text_offset[-1] + last_token_len)
last_token_len = len(token)
if num_output_top_logprobs:
logprobs.top_logprobs.append(
{llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()}
if step_top_logprobs
else None
)
logprobs.top_logprobs.append({llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()} if step_top_logprobs else None)
return logprobs
@@ -114,14 +106,8 @@ def mount_to_svc(svc, llm):
app = Starlette(
debug=True,
routes=[
Route(
'/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']
),
Route(
'/completions',
functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm),
methods=['POST'],
),
Route('/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']),
Route('/completions', functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm), methods=['POST']),
Route(
'/chat/completions',
functools.partial(
@@ -146,9 +132,7 @@ def mount_to_svc(svc, llm):
# GET /v1/models
@add_schema_definitions
def list_models(_, llm):
return JSONResponse(
converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value
)
return JSONResponse(converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value)
# POST /v1/chat/completions
@@ -182,9 +166,7 @@ async def chat_completions(req, llm):
config = llm.config.compatible_options(request)
def get_role() -> str:
return (
request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant'
) # TODO: Support custom role here.
return request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant' # TODO: Support custom role here.
try:
result_generator = llm.generate_iterator(prompt, request_id=request_id, **config)
@@ -198,9 +180,7 @@ async def chat_completions(req, llm):
id=request_id,
created=created_time,
model=model_name,
choices=[
ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)
],
choices=[ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)],
)
if usage is not None:
response.usage = usage
@@ -251,17 +231,12 @@ async def chat_completions(req, llm):
if final_result is None:
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
final_result = final_result.with_options(
outputs=[
output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index])
for output in final_result.outputs
]
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs]
)
role = get_role()
choices = [
ChatCompletionResponseChoice(
index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason
)
ChatCompletionResponseChoice(index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason)
for output in final_result.outputs
]
if request.echo:
@@ -275,9 +250,7 @@ async def chat_completions(req, llm):
num_prompt_tokens = len(final_result.prompt_token_ids)
num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs)
usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens)
response = ChatCompletionResponse(
id=request_id, created=created_time, model=model_name, usage=usage, choices=choices
)
response = ChatCompletionResponse(id=request_id, created=created_time, model=model_name, usage=usage, choices=choices)
return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value)
except Exception as err:
traceback.print_exc()
@@ -369,13 +342,7 @@ async def completions(req, llm):
top_logprobs = res.prompt_logprobs
previous_echo[i] = True
if request.logprobs is not None:
logprobs = create_logprobs(
output.token_ids,
output.logprobs[previous_num_tokens[i] :],
request.logprobs,
len(previous_texts[i]),
llm=llm,
)
logprobs = create_logprobs(output.token_ids, output.logprobs[previous_num_tokens[i] :], request.logprobs, len(previous_texts[i]), llm=llm)
previous_num_tokens[i] += len(output.token_ids)
previous_texts[i] += output.text
yield f'data: {create_stream_response_json(index=i, text=output.text, logprobs=logprobs, finish_reason=output.finish_reason)}\n\n'
@@ -402,10 +369,7 @@ async def completions(req, llm):
if final_result is None:
return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.')
final_result = final_result.with_options(
outputs=[
output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index])
for output in final_result.outputs
]
outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs]
)
choices = []
@@ -428,9 +392,7 @@ async def completions(req, llm):
output_text = prompt_text + output_text
else:
output_text = prompt_text
choice_data = CompletionResponseChoice(
index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason
)
choice_data = CompletionResponseChoice(index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason)
choices.append(choice_data)
num_prompt_tokens = len(final_result.prompt_token_ids)


@@ -14,9 +14,7 @@ from ..protocol.openai import ChatCompletionRequest, CompletionRequest, LogProbs
def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ...
def jsonify_attr(obj: AttrsInstance) -> str: ...
def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ...
async def check_model(
request: Union[CompletionRequest, ChatCompletionRequest], model: str
) -> Optional[JSONResponse]: ...
async def check_model(request: Union[CompletionRequest, ChatCompletionRequest], model: str) -> Optional[JSONResponse]: ...
def create_logprobs(
token_ids: List[int],
top_logprobs: List[Dict[int, float]], #


@@ -30,9 +30,7 @@ def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
'For example: "bentoml.transformers.save_model(..., custom_objects={\'tokenizer\': tokenizer})"'
) from None
else:
tokenizer = transformers.AutoTokenizer.from_pretrained(
bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs
)
tokenizer = transformers.AutoTokenizer.from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs)
if tokenizer.pad_token_id is None:
if config.pad_token_id is not None:


@@ -29,9 +29,7 @@ def patch_correct_tag(llm, config, _revision=None) -> None:
if _revision is None and llm.tag.version is not None:
_revision = llm.tag.version
if llm.tag.version is None:
_object_setattr(
llm, '_tag', attr.evolve(llm.tag, version=_revision)
) # HACK: This copies the correct revision into llm.tag
_object_setattr(llm, '_tag', attr.evolve(llm.tag, version=_revision)) # HACK: This copies the correct revision into llm.tag
if llm._revision is None:
_object_setattr(llm, '_revision', _revision) # HACK: This copies the correct revision into llm._model_version
@@ -47,9 +45,7 @@ def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadat
if trust_remote_code:
auto_map = getattr(config, 'auto_map', {})
if not auto_map:
raise RuntimeError(
f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}'
)
raise RuntimeError(f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}')
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if autoclass not in auto_map:
raise RuntimeError(
@@ -60,10 +56,7 @@ def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadat
raise RuntimeError(
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
)
metadata.update({
'_pretrained_class': architectures[0],
'_revision': get_hash(config) if not llm.local else llm.revision,
})
metadata.update({'_pretrained_class': architectures[0], '_revision': get_hash(config) if not llm.local else llm.revision})
return metadata
@@ -144,9 +137,7 @@ def save_model(
bentomodel.flush()
bentomodel.save(_model_store)
openllm.utils.analytics.track(
openllm.utils.analytics.ModelSaveEvent(
module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024
)
openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024)
)
finally:
bentomodel.exit_cloudpickle_context(imported_modules)


@@ -10,9 +10,7 @@ from openllm_core._typing_compat import M, T
from .._llm import LLM
def get_hash(config: transformers.PretrainedConfig) -> str: ...
def patch_correct_tag(
llm: LLM[M, T], config: transformers.PretrainedConfig, _revision: Optional[str] = ...
) -> None: ...
def patch_correct_tag(llm: LLM[M, T], config: transformers.PretrainedConfig, _revision: Optional[str] = ...) -> None: ...
@contextmanager
def save_model(
llm: LLM[M, T],


@@ -12,9 +12,7 @@ from .._helpers import patch_correct_tag, save_model
from ..transformers._helpers import get_tokenizer, process_config
if not is_ctranslate_available():
raise RuntimeError(
"'ctranslate2' is required to use with backend 'ctranslate'. Install it with 'pip install \"openllm[ctranslate]\"'"
)
raise RuntimeError("'ctranslate2' is required to use with backend 'ctranslate'. Install it with 'pip install \"openllm[ctranslate]\"'")
import ctranslate2
from ctranslate2.converters.transformers import TransformersConverter
@@ -44,17 +42,11 @@ def import_model(llm, *decls, trust_remote_code, **attrs):
config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
patch_correct_tag(llm, config)
tokenizer = get_tokenizer(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
with save_model(
llm, config, False, trust_remote_code, 'ctranslate', [importlib.import_module(tokenizer.__module__)]
) as save_metadata:
with save_model(llm, config, False, trust_remote_code, 'ctranslate', [importlib.import_module(tokenizer.__module__)]) as save_metadata:
bentomodel, _ = save_metadata
if llm._local:
shutil.copytree(
llm.model_id,
bentomodel.path,
symlinks=False,
ignore=shutil.ignore_patterns('.git', 'venv', '__pycache__', '.venv'),
dirs_exist_ok=True,
llm.model_id, bentomodel.path, symlinks=False, ignore=shutil.ignore_patterns('.git', 'venv', '__pycache__', '.venv'), dirs_exist_ok=True
)
else:
TransformersConverter(
@@ -74,9 +66,7 @@ def get(llm):
model = bentoml.models.get(llm.tag)
backend = model.info.labels['backend']
if backend != llm.__llm_backend__:
raise OpenLLMException(
f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'."
)
raise OpenLLMException(f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'.")
patch_correct_tag(
llm,
transformers.AutoConfig.from_pretrained(model.path_of('/hf/'), trust_remote_code=llm.trust_remote_code),


@@ -12,7 +12,7 @@ from .._helpers import patch_correct_tag, save_model
logger = logging.getLogger(__name__)
__all__ = ['get', 'import_model', 'load_model']
__all__ = ['import_model', 'get', 'load_model']
_object_setattr = object.__setattr__
@@ -44,13 +44,7 @@ def import_model(llm, *decls, trust_remote_code, **attrs):
f.write(orjson.dumps(config.quantization_config, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode())
if llm._local: # possible local path
model = infer_autoclass_from_llm(llm, config).from_pretrained(
llm.model_id,
*decls,
local_files_only=True,
config=config,
trust_remote_code=trust_remote_code,
**hub_attrs,
**attrs,
llm.model_id, *decls, local_files_only=True, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs
)
# for trust_remote_code to work
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
@@ -74,9 +68,7 @@ def get(llm):
model = bentoml.models.get(llm.tag)
backend = model.info.labels['backend']
if backend != llm.__llm_backend__:
raise OpenLLMException(
f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'."
)
raise OpenLLMException(f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'.")
patch_correct_tag(
llm,
transformers.AutoConfig.from_pretrained(model.path, trust_remote_code=llm.trust_remote_code),
@@ -132,9 +124,7 @@ def load_model(llm, *decls, **attrs):
)
except Exception as err:
logger.debug("Failed to load model with 'use_flash_attention_2' (lookup for traceback):\n%s", err)
model = auto_class.from_pretrained(
llm.bentomodel.path, device_map=device_map, trust_remote_code=llm.trust_remote_code, **attrs
)
model = auto_class.from_pretrained(llm.bentomodel.path, device_map=device_map, trust_remote_code=llm.trust_remote_code, **attrs)
else:
try:
model = auto_class.from_pretrained(
@@ -149,12 +139,7 @@ def load_model(llm, *decls, **attrs):
except Exception as err:
logger.debug("Failed to load model with 'use_flash_attention_2' (lookup for traceback):\n%s", err)
model = auto_class.from_pretrained(
llm.bentomodel.path,
*decls,
config=config,
trust_remote_code=llm.trust_remote_code,
device_map=device_map,
**attrs,
llm.bentomodel.path, *decls, config=config, trust_remote_code=llm.trust_remote_code, device_map=device_map, **attrs
)
check_unintialised_params(model)
return model


@@ -6,9 +6,7 @@ logger = logging.getLogger(__name__)
def get_tokenizer(model_id_or_path, trust_remote_code, **attrs):
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_id_or_path, trust_remote_code=trust_remote_code, **attrs
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code, **attrs)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
return tokenizer


@@ -1,6 +1,6 @@
import functools, importlib.metadata, openllm_core
__all__ = ['available_devices', 'device_count', 'generate_labels']
__all__ = ['generate_labels', 'available_devices', 'device_count']
def generate_labels(llm):


@@ -5,25 +5,12 @@ from bentoml_cli.utils import BentoMLCommandGroup
from click import shell_completion as sc
from openllm_core._configuration import LLMConfig
from openllm_core._typing_compat import (
Concatenate,
DictStrAny,
LiteralBackend,
LiteralSerialisation,
ParamSpec,
AnyCallable,
get_literal_args,
)
from openllm_core._typing_compat import Concatenate, DictStrAny, LiteralBackend, LiteralSerialisation, ParamSpec, AnyCallable, get_literal_args
from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath
class _OpenLLM_GenericInternalConfig(LLMConfig):
__config__ = {
'name_type': 'lowercase',
'default_id': 'openllm/generic',
'model_ids': ['openllm/generic'],
'architecture': 'PreTrainedModel',
}
__config__ = {'name_type': 'lowercase', 'default_id': 'openllm/generic', 'model_ids': ['openllm/generic'], 'architecture': 'PreTrainedModel'}
class GenerationConfig:
top_k: int = 15
@@ -50,20 +37,11 @@ def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [
sc.CompletionItem(inflection.dasherize(it), help='Model')
for it in openllm.CONFIG_MAPPING
if it.startswith(incomplete)
]
return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
def parse_config_options(
config: LLMConfig,
server_timeout: int,
workers_per_resource: float,
device: t.Tuple[str, ...] | None,
cors: bool,
environ: DictStrAny,
config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny
) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
@@ -78,21 +56,14 @@ def parse_config_options(
if device:
if len(device) > 1:
_bentoml_config_options_opts.extend([
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
for idx, dev in enumerate(device)
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)
])
else:
_bentoml_config_options_opts.append(
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]'
)
_bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
if cors:
_bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend([
'api_server.http.cors.enabled=true',
'api_server.http.cors.access_control_allow_origins="*"',
])
_bentoml_config_options_opts.extend([
f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
])
_bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
@@ -171,9 +142,7 @@ def start_decorator(fn: FC) -> FC:
return composed(fn)
def parse_device_callback(
_: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None
) -> t.Tuple[str, ...] | None:
def parse_device_callback(_: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
if value is None:
return value
el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
@@ -192,19 +161,13 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
def parse_serve_args() -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
from bentoml_cli.cli import cli
group = cog.optgroup.group(
'Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]'
)
group = cog.optgroup.group('Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]')
def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
serve_command = cli.commands['serve']
# The first variable is the argument bento
# The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
serve_options = [
p
for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
if p.name not in _IGNORED_OPTIONS
]
serve_options = [p for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS]
for options in reversed(serve_options):
attrs = options.to_info_dict()
# we don't need param_type_name, since it should all be options
@@ -258,13 +221,7 @@ def adapter_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callab
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--cors/--no-cors',
show_default=True,
default=False,
envvar='OPENLLM_CORS',
show_envvar=True,
help='Enable CORS for the server.',
**attrs,
'--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs
)(f)
@@ -318,12 +275,7 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_argument(
'model_name',
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
required=required,
**attrs,
)(f)
return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
@@ -361,9 +313,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
)(f)
def workers_per_resource_option(
f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any
) -> t.Callable[[FC], FC]:
def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--workers-per-resource',
default=None,
@@ -431,9 +381,7 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
float(value) # type: ignore[arg-type]
except ValueError:
raise click.BadParameter(
f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.",
ctx,
param,
f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param
) from None
else:
return value
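The validation reverted here reads more easily as a complete callback. The sketch below reconstructs one from the visible lines; _wpr_strategies is assumed to be a set of strategy names (the real values live elsewhere in _factory.py).

from __future__ import annotations

import click

_wpr_strategies = {'round_robin', 'conserved'}  # assumed strategy names, for illustration only

def workers_per_resource_callback(ctx: click.Context | None, param: click.Parameter | None, value: str | None) -> str | None:
  if value is None:
    return value
  if value in _wpr_strategies:
    return value
  try:
    float(value)
  except ValueError:
    raise click.BadParameter(
      f"'workers_per_resource' only accepts '{_wpr_strategies}' as possible strategies, otherwise pass in a float.", ctx, param
    ) from None
  else:
    return value

print(workers_per_resource_callback(None, None, '0.5'))          # '0.5'
print(workers_per_resource_callback(None, None, 'round_robin'))  # 'round_robin'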

View File

@@ -69,10 +69,7 @@ def _start(
if timeout:
args.extend(['--server-timeout', str(timeout)])
if workers_per_resource:
args.extend([
'--workers-per-resource',
str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource,
])
args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
if device and not os.environ.get('CUDA_VISIBLE_DEVICES'):
args.extend(['--device', ','.join(device)])
if quantize:
@@ -80,11 +77,7 @@ def _start(
if cors:
args.append('--cors')
if adapter_map:
args.extend(
list(
itertools.chain.from_iterable([['--adapter-id', f"{k}{':' + v if v else ''}"] for k, v in adapter_map.items()])
)
)
args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
if additional_args:
args.extend(additional_args)
if __test__:
@@ -155,9 +148,7 @@ def _build(
'--machine',
'--quiet',
'--serialisation',
first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
),
first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'),
]
if quantize:
args.extend(['--quantize', quantize])
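first_not_none drives the serialisation default in several of these hunks. The helper below is a plausible reading of its semantics, written as a self-contained function; it is an assumption, not the actual openllm_core.utils implementation.

from __future__ import annotations

import typing as t

T = t.TypeVar('T')

def first_not_none(*args: T | None, default: T) -> T:
  # Assumed behaviour: return the first argument that is not None, else the default.
  return next((arg for arg in args if arg is not None), default)

print(first_not_none(None, default='safetensors'))      # 'safetensors'
print(first_not_none('legacy', default='safetensors'))  # 'legacy'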
@@ -174,7 +165,7 @@ def _build(
if overwrite:
args.append('--overwrite')
if adapter_map:
args.extend([f"--adapter-id={k}{':' + v if v is not None else ''}" for k, v in adapter_map.items()])
args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()])
if model_version:
args.extend(['--model-version', model_version])
if bento_version:
@@ -274,4 +265,4 @@ start, build, import_model, list_models = (
codegen.gen_sdk(_import_model),
codegen.gen_sdk(_list_models),
)
__all__ = ['build', 'import_model', 'list_models', 'start']
__all__ = ['start', 'build', 'import_model', 'list_models']
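Both the start and build paths in this file flatten adapter_map into repeated --adapter-id flags. A tiny standalone illustration with made-up adapter ids:

import itertools

adapter_map = {'org/adapter-a': 'default', 'org/adapter-b': None}  # hypothetical adapters
args = list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':' + v if v else ''}"] for k, v in adapter_map.items()]))
print(args)  # ['--adapter-id', 'org/adapter-a:default', '--adapter-id', 'org/adapter-b']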

View File

@@ -43,15 +43,7 @@ from openllm_core.utils import (
)
from . import termui
from ._factory import (
FC,
_AnyCallable,
machine_option,
model_name_argument,
parse_config_options,
start_decorator,
optimization_decorator,
)
from ._factory import FC, _AnyCallable, machine_option, model_name_argument, parse_config_options, start_decorator, optimization_decorator
if t.TYPE_CHECKING:
import torch
@@ -103,18 +95,12 @@ def backend_warning(backend: LiteralBackend, build: bool = False) -> None:
'vLLM is not available. Note that PyTorch backend is not as performant as vLLM and you should always consider using vLLM for production.'
)
if build:
logger.info(
"Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally."
)
logger.info("Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally.")
class Extensions(click.MultiCommand):
def list_commands(self, ctx: click.Context) -> list[str]:
return sorted([
filename[:-3]
for filename in os.listdir(_EXT_FOLDER)
if filename.endswith('.py') and not filename.startswith('__')
])
return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith('.py') and not filename.startswith('__')])
def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None:
try:
@@ -131,41 +117,19 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
def common_params(f: t.Callable[P, t.Any]) -> t.Callable[[FC], FC]:
# The following logic is similar to that of BentoMLCommandGroup
@cog.optgroup.group(name='Global options', help='Shared globals options for all OpenLLM CLI.') # type: ignore[misc]
@cog.optgroup.option('-q', '--quiet', envvar=QUIET_ENV_VAR, is_flag=True, default=False, help='Suppress all output.', show_envvar=True)
@cog.optgroup.option(
'-q', '--quiet', envvar=QUIET_ENV_VAR, is_flag=True, default=False, help='Suppress all output.', show_envvar=True
'--debug', '--verbose', 'debug', envvar=DEBUG_ENV_VAR, is_flag=True, default=False, help='Print out debug logs.', show_envvar=True
)
@cog.optgroup.option(
'--debug',
'--verbose',
'debug',
envvar=DEBUG_ENV_VAR,
is_flag=True,
default=False,
help='Print out debug logs.',
show_envvar=True,
'--do-not-track', is_flag=True, default=False, envvar=analytics.OPENLLM_DO_NOT_TRACK, help='Do not send usage info', show_envvar=True
)
@cog.optgroup.option(
'--do-not-track',
is_flag=True,
default=False,
envvar=analytics.OPENLLM_DO_NOT_TRACK,
help='Do not send usage info',
show_envvar=True,
)
@cog.optgroup.option(
'--context',
'cloud_context',
envvar='BENTOCLOUD_CONTEXT',
type=click.STRING,
default=None,
help='BentoCloud context name.',
show_envvar=True,
'--context', 'cloud_context', envvar='BENTOCLOUD_CONTEXT', type=click.STRING, default=None, help='BentoCloud context name.', show_envvar=True
)
@click.pass_context
@functools.wraps(f)
def wrapper(
ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs
) -> t.Any:
def wrapper(ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs) -> t.Any:
ctx.obj = GlobalOptions(cloud_context=cloud_context)
if quiet:
set_quiet_mode(True)
@@ -179,9 +143,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
return wrapper
@staticmethod
def usage_tracking(
func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any
) -> t.Callable[Concatenate[bool, P], t.Any]:
def usage_tracking(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[Concatenate[bool, P], t.Any]:
command_name = attrs.get('name', func.__name__)
@functools.wraps(func)
@@ -240,9 +202,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
_memo = getattr(wrapped, '__click_params__', None)
if _memo is None:
raise ValueError('Click command not registered correctly.')
_object_setattr(
wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS :] + _memo[: -self.NUMBER_OF_COMMON_PARAMS]
)
_object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS :] + _memo[: -self.NUMBER_OF_COMMON_PARAMS])
# NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup
cmd = super(BentoMLCommandGroup, self).command(*args, **kwargs)(wrapped)
# NOTE: add aliases to a given commands if it is specified.
@@ -250,7 +210,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
if not cmd.name:
raise ValueError('name is required when aliases are available.')
self._commands[cmd.name] = aliases
self._aliases.update(dict.fromkeys(aliases, cmd.name))
self._aliases.update({alias: cmd.name for alias in aliases})
return cmd
return decorator
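The alias bookkeeping changed in this hunk is purely stylistic; both spellings produce the same mapping, for example:

aliases, name = ['start-http'], 'start'
assert dict.fromkeys(aliases, name) == {alias: name for alias in aliases} == {'start-http': 'start'}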
@@ -317,12 +277,7 @@ def cli() -> None:
"""
@cli.command(
context_settings=termui.CONTEXT_SETTINGS,
name='start',
aliases=['start-http'],
short_help='Start a LLMServer for any supported LLM.',
)
@cli.command(context_settings=termui.CONTEXT_SETTINGS, name='start', aliases=['start-http'], short_help='Start a LLMServer for any supported LLM.')
@click.argument('model_id', type=click.STRING, metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=True)
@click.option(
'--model-id',
@@ -375,9 +330,7 @@ def start_command(
```
"""
if backend == 'pt':
logger.warning(
'PyTorch backend is deprecated and will be removed in future releases. Make sure to use vLLM instead.'
)
logger.warning('PyTorch backend is deprecated and will be removed in future releases. Make sure to use vLLM instead.')
if model_id in openllm.CONFIG_MAPPING:
_model_name = model_id
if deprecated_model_id is not None:
@@ -395,17 +348,11 @@ def start_command(
from openllm.serialisation.transformers.weights import has_safetensors_weights
serialisation = first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
)
serialisation = first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy')
if serialisation == 'safetensors' and quantize is not None:
logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize)
logger.warning(
"Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.",
model_id,
serialisation,
)
logger.warning("Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.", model_id, serialisation)
logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
import torch
@@ -433,9 +380,7 @@ def start_command(
config, server_attrs = llm.config.model_validate_click(**attrs)
server_timeout = first_not_none(server_timeout, default=config['timeout'])
server_attrs.update({'working_dir': pkg.source_locations('openllm'), 'timeout': server_timeout})
development = server_attrs.pop(
'development'
) # XXX: currently, there are no development args in bentoml.Server. To be fixed upstream.
development = server_attrs.pop('development') # XXX: currently, there are no development args in bentoml.Server. To be fixed upstream.
server_attrs.setdefault('production', not development)
start_env = process_environ(
@@ -465,12 +410,8 @@ def start_command(
return config
def process_environ(
config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True
):
environ = parse_config_options(
config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {}
)
def process_environ(config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True):
environ = parse_config_options(config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {})
environ.update({
'OPENLLM_MODEL_ID': model_id,
'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),
@@ -515,8 +456,7 @@ def build_bento_instruction(llm, model_id, serialisation, adapter_map):
cmd_name += f' --serialization {serialisation}'
if adapter_map is not None:
cmd_name += ' ' + ' '.join([
f'--adapter-id {s}'
for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
])
if not openllm.utils.get_quiet_mode():
termui.info(f"🚀Tip: run '{cmd_name}' to create a BentoLLM for '{model_id}'")
@@ -551,12 +491,8 @@ def run_server(args, env, return_process=False) -> subprocess.Popen[bytes] | int
if return_process:
return process
stop_event = threading.Event()
stdout, stderr = (
threading.Thread(target=handle, args=(process.stdout, stop_event)),
threading.Thread(target=handle, args=(process.stderr, stop_event)),
)
stdout.start()
stderr.start() # noqa: E702
stdout, stderr = threading.Thread(target=handle, args=(process.stdout, stop_event)), threading.Thread(target=handle, args=(process.stderr, stop_event))
stdout.start(); stderr.start() # noqa: E702
try:
process.wait()
@@ -571,12 +507,9 @@ def run_server(args, env, return_process=False) -> subprocess.Popen[bytes] | int
raise
finally:
stop_event.set()
stdout.join()
stderr.join() # noqa: E702
if process.poll() is not None:
process.kill()
stdout.join()
stderr.join() # noqa: E702
stdout.join(); stderr.join() # noqa: E702
if process.poll() is not None: process.kill()
stdout.join(); stderr.join() # noqa: E702
return process.returncode
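The stream handling reverted here is the usual thread-per-pipe pattern. A self-contained sketch follows; the body of handle is not shown in the diff, so the one below is an assumption, and the shutdown bookkeeping is simplified.

from __future__ import annotations

import subprocess, sys, threading

def handle(pipe, stop_event: threading.Event) -> None:
  # Assumed shape of the `handle` helper: echo child output until EOF or until told to stop.
  for line in iter(pipe.readline, b''):
    if stop_event.is_set():
      break
    sys.stdout.write(line.decode(errors='replace'))

process = subprocess.Popen([sys.executable, '-c', 'print("hello from the server process")'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stop_event = threading.Event()
stdout = threading.Thread(target=handle, args=(process.stdout, stop_event))
stderr = threading.Thread(target=handle, args=(process.stderr, stop_event))
stdout.start(); stderr.start()
try:
  process.wait()
finally:
  stop_event.set()
  stdout.join(); stderr.join()
  if process.poll() is None:  # only kill a child that is still running
    process.kill()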
@@ -664,10 +597,7 @@ def import_command(
backend=backend,
dtype=dtype,
serialisation=t.cast(
LiteralSerialisation,
first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
),
LiteralSerialisation, first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy')
),
)
backend_warning(llm.__llm_backend__)
@@ -726,21 +656,14 @@ class BuildBentoOutput(t.TypedDict):
metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]',
help='Deprecated. Use positional argument instead.',
)
@click.option(
'--bento-version',
type=str,
default=None,
help='Optional bento version for this BentoLLM. Default is the model revision.',
)
@click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the model revision.')
@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
@click.option(
'--enable-features',
multiple=True,
nargs=1,
metavar='FEATURE[,FEATURE]',
help='Enable additional features for building this LLM Bento. Available: {}'.format(
', '.join(OPTIONAL_DEPENDENCIES)
),
help='Enable additional features for building this LLM Bento. Available: {}'.format(', '.join(OPTIONAL_DEPENDENCIES)),
)
@optimization_decorator
@click.option(
@@ -751,12 +674,7 @@ class BuildBentoOutput(t.TypedDict):
help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.",
)
@click.option('--build-ctx', help='Build context. This is required if --adapter-id uses a relative path', default=None)
@click.option(
'--dockerfile-template',
default=None,
type=click.File(),
help='Optional custom dockerfile template to be used with this BentoLLM.',
)
@click.option('--dockerfile-template', default=None, type=click.File(), help='Optional custom dockerfile template to be used with this BentoLLM.')
@cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options') # type: ignore[misc]
@cog.optgroup.option(
'--containerize',
@@ -849,9 +767,7 @@ def build_command(
state = ItemState.NOT_FOUND
if backend == 'pt':
logger.warning(
"PyTorch backend is deprecated and will be removed from the next releases. Will set default backend to 'vllm' instead."
)
logger.warning("PyTorch backend is deprecated and will be removed from the next releases. Will set default backend to 'vllm' instead.")
llm = openllm.LLM(
model_id=model_id,
@@ -861,9 +777,7 @@ def build_command(
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
serialisation=first_not_none(
serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
),
serialisation=first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'),
_eager=False,
)
if llm.__llm_backend__ not in llm.config['backend']:
@@ -875,9 +789,7 @@ def build_command(
model = openllm.serialisation.import_model(llm, trust_remote_code=llm.trust_remote_code)
llm._tag = model.tag
os.environ.update(
**process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm)
)
os.environ.update(**process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm))
try:
assert llm.bentomodel # HACK: call it here to patch correct tag with revision and everything
@@ -944,11 +856,7 @@ def build_command(
def get_current_bentocloud_context() -> str | None:
try:
context = (
cloud_config.get_context(ctx.obj.cloud_context)
if ctx.obj.cloud_context
else cloud_config.get_current_context()
)
context = cloud_config.get_context(ctx.obj.cloud_context) if ctx.obj.cloud_context else cloud_config.get_current_context()
return context.name
except Exception:
return None
@@ -972,9 +880,7 @@ def build_command(
tag=str(bento_tag),
backend=llm.__llm_backend__,
instructions=[
DeploymentInstruction.from_content(
type='bentocloud', instr="☁️ Push to BentoCloud with 'bentoml push':\n $ {cmd}", cmd=push_cmd
),
DeploymentInstruction.from_content(type='bentocloud', instr="☁️ Push to BentoCloud with 'bentoml push':\n $ {cmd}", cmd=push_cmd),
DeploymentInstruction.from_content(
type='container',
instr="🐳 Container BentoLLM with 'bentoml containerize':\n $ {cmd}",
@@ -1000,9 +906,7 @@ def build_command(
termui.echo(f" * {instruction['content']}\n", nl=False)
if push:
BentoMLContainer.bentocloud_client.get().push_bento(
bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push
)
BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push)
elif containerize:
container_backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker'))
try:
@@ -1042,8 +946,7 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
architecture=config.__openllm_architecture__,
example_id=random.choice(config.__openllm_model_ids__),
supported_backends=config.__openllm_backend__,
installation='pip install '
+ (f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config.__openllm_requirements__ else 'openllm'),
installation='pip install ' + (f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config.__openllm_requirements__ else 'openllm'),
items=[
str(md.tag)
for md in bentoml.models.list()
@@ -1062,13 +965,7 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
@cli.command()
@model_name_argument(required=False)
@click.option('-y', '--yes', '--assume-yes', is_flag=True, help='Skip confirmation when deleting a specific model')
@click.option(
'--include-bentos/--no-include-bentos',
is_flag=True,
hidden=True,
default=True,
help='Whether to also include bentos when pruning.',
)
@click.option('--include-bentos/--no-include-bentos', is_flag=True, hidden=True, default=True, help='Whether to also include bentos when pruning.')
@inject
@click.pass_context
def prune_command(
@@ -1085,32 +982,24 @@ def prune_command(
If a model type is passed, then only prune models for that given model type.
"""
available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [
(m, model_store)
for m in bentoml.models.list()
if 'framework' in m.info.labels and m.info.labels['framework'] == 'openllm'
(m, model_store) for m in bentoml.models.list() if 'framework' in m.info.labels and m.info.labels['framework'] == 'openllm'
]
if model_name is not None:
available = [
(m, store)
for m, store in available
if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name)
(m, store) for m, store in available if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name)
] + [
(b, bento_store)
for b in bentoml.bentos.list()
if 'start_name' in b.info.labels and b.info.labels['start_name'] == inflection.underscore(model_name)
]
else:
available += [
(b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels
]
available += [(b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels]
for store_item, store in available:
if yes:
delete_confirmed = True
else:
delete_confirmed = click.confirm(
f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?"
)
delete_confirmed = click.confirm(f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?")
if delete_confirmed:
store.delete(store_item.tag)
termui.warning(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.")
@@ -1157,17 +1046,8 @@ def shared_client_options(f: _AnyCallable | None = None) -> t.Callable[[FC], FC]
@cli.command()
@shared_client_options
@click.option(
'--server-type',
type=click.Choice(['grpc', 'http']),
help='Server type',
default='http',
show_default=True,
hidden=True,
)
@click.option(
'--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.'
)
@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True, hidden=True)
@click.option('--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.')
@click.argument('prompt', type=click.STRING)
@click.option(
'--sampling-params',

View File

@@ -21,9 +21,7 @@ if t.TYPE_CHECKING:
@machine_option
@click.pass_context
@inject
def cli(
ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
) -> str | None:
def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
"""Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
try:
bentomodel = _bento_store.get(bento)

View File

@@ -17,9 +17,7 @@ if t.TYPE_CHECKING:
from bentoml._internal.bento import BentoStore
@click.command(
'get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.'
)
@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject

View File

@@ -22,9 +22,7 @@ class PromptFormatter(string.Formatter):
raise ValueError('Positional arguments are not supported')
return super().vformat(format_string, args, kwargs)
def check_unused_args(
self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]
) -> None:
def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> None:
extras = set(kwargs).difference(used_args)
if extras:
raise KeyError(f'Extra params passed: {extras}')
@@ -58,9 +56,7 @@ class PromptTemplate:
try:
return self.template.format(**prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template."
) from None
raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template.") from None
@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
@@ -128,21 +124,15 @@ def cli(
if prompt_template_file and chat_template_file:
ctx.fail('prompt-template-file and chat-template-file are mutually exclusive.')
acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set(
inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys()
)
acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set(inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys())
if model_id in acceptable:
logger.warning(
'Using a default prompt from OpenLLM. Note that this prompt might not work for your intended usage.\n'
)
logger.warning('Using a default prompt from OpenLLM. Note that this prompt might not work for your intended usage.\n')
config = openllm.AutoConfig.for_model(model_id)
template = prompt_template_file.read() if prompt_template_file is not None else config.template
system_message = system_message or config.system_message
try:
formatted = (
PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized)
)
formatted = PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized)
except RuntimeError as err:
logger.debug('Exception caught while formatting prompt: %s', err)
ctx.fail(str(err))
@@ -159,21 +149,15 @@ def cli(
for architecture in config.architectures:
if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
system_message = (
openllm.AutoConfig.infer_class_from_name(
openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]
)
openllm.AutoConfig.infer_class_from_name(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture])
.model_construct_env()
.system_message
)
break
else:
ctx.fail(
f'Failed to infer system message from model architecture: {config.architectures}. Please pass in --system-message'
)
ctx.fail(f'Failed to infer system message from model architecture: {config.architectures}. Please pass in --system-message')
messages = [{'role': 'system', 'content': system_message}, {'role': 'user', 'content': prompt}]
formatted = tokenizer.apply_chat_template(
messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False
)
formatted = tokenizer.apply_chat_template(messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False)
termui.echo(orjson.dumps({'prompt': formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
ctx.exit(0)

View File

@@ -33,17 +33,12 @@ def cli(model_name: str | None) -> DictStrAny:
}
if model_name is not None:
ids_in_local_store = {
k: [
i
for i in v
if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)
]
k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)]
for k, v in ids_in_local_store.items()
}
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
local_models = {
k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val]
for k, val in ids_in_local_store.items()
k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()
}
termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
return local_models
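To make the dict comprehension in this hunk concrete, here is a toy version with made-up entries; human_readable_size below is an assumed stand-in for the real openllm.utils helper, not its source.

# Hypothetical stand-ins for the bentoml model store entries used above.
def human_readable_size(size: float) -> str:  # assumed helper behaviour
  for unit in ('B', 'KiB', 'MiB', 'GiB', 'TiB'):
    if size < 1024 or unit == 'TiB':
      return f'{size:.2f} {unit}'
    size /= 1024

models = {'llama': [{'tag': 'llama:abc123', 'size_bytes': 13_476_839_424}]}
local_models = {k: [{'tag': m['tag'], 'size': human_readable_size(m['size_bytes'])} for m in v] for k, v in models.items()}
print(local_models)  # {'llama': [{'tag': 'llama:abc123', 'size': '12.55 GiB'}]}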

View File

@@ -32,14 +32,7 @@ def load_notebook_metadata() -> DictStrAny:
@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('output-dir', default=None, required=False)
@click.option(
'--port',
envvar='JUPYTER_PORT',
show_envvar=True,
show_default=True,
default=8888,
help='Default port for Jupyter server',
)
@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
@click.pass_context
def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
"""OpenLLM Playground.
@@ -60,9 +53,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
> This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
"""
if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
raise RuntimeError(
"Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
)
raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
metadata = load_notebook_metadata()
_temp_dir = False
if output_dir is None:
@@ -74,9 +65,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
for module in pkgutil.iter_modules(playground.__path__):
if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
logger.debug(
'Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module'
)
logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
continue
if not isinstance(module.module_finder, importlib.machinery.FileFinder):
continue

View File

@@ -25,14 +25,7 @@ class Level(enum.IntEnum):
@property
def color(self) -> str | None:
return {
Level.NOTSET: None,
Level.DEBUG: 'cyan',
Level.INFO: 'green',
Level.WARNING: 'yellow',
Level.ERROR: 'red',
Level.CRITICAL: 'red',
}[self]
return {Level.NOTSET: None, Level.DEBUG: 'cyan', Level.INFO: 'green', Level.WARNING: 'yellow', Level.ERROR: 'red', Level.CRITICAL: 'red'}[self]
@classmethod
def from_logging_level(cls, level: int) -> Level:
@@ -82,9 +75,5 @@ def echo(text: t.Any, fg: str | None = None, *, _with_style: bool = True, json:
COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
CONTEXT_SETTINGS: DictStrAny = {
'help_option_names': ['-h', '--help'],
'max_content_width': COLUMNS,
'token_normalize_func': inflection.underscore,
}
__all__ = ['COLUMNS', 'CONTEXT_SETTINGS', 'Level', 'critical', 'debug', 'echo', 'error', 'info', 'log', 'warning']
CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS', 'log', 'warning', 'error', 'critical', 'debug', 'info', 'Level']
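Since this file closes the section, here is a compact runnable rendering of the Level helper shown above. The member values and the from_logging_level body are assumptions; the diff only shows the colour mapping and the __all__/CONTEXT_SETTINGS layout.

from __future__ import annotations

import enum
import logging

class Level(enum.IntEnum):
  # Values are assumed to track the stdlib logging levels.
  NOTSET = logging.NOTSET
  DEBUG = logging.DEBUG
  INFO = logging.INFO
  WARNING = logging.WARNING
  ERROR = logging.ERROR
  CRITICAL = logging.CRITICAL

  @property
  def color(self) -> str | None:
    return {Level.NOTSET: None, Level.DEBUG: 'cyan', Level.INFO: 'green', Level.WARNING: 'yellow', Level.ERROR: 'red', Level.CRITICAL: 'red'}[self]

  @classmethod
  def from_logging_level(cls, level: int) -> Level:
    return cls(level)

print(Level.from_logging_level(logging.WARNING).color)  # yellow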