chore: ignore new lines split [skip ci]

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
aarnphm-ec2-dev
2023-09-01 17:00:49 +00:00
parent 608de0b667
commit 7d893e6cd2
70 changed files with 575 additions and 950 deletions

View File

@@ -129,9 +129,7 @@ else:
try:
if not openllm_core.utils.is_vllm_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_vllm_objects"] = [
name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)
]
_import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
_import_structure["models.baichuan"].extend(["VLLMBaichuan"])
_import_structure["models.llama"].extend(["VLLMLlama"])
@@ -157,9 +155,7 @@ else:
try:
if not openllm_core.utils.is_flax_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_flax_objects"] = [
name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)
]
_import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
_import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
_import_structure["models.opt"].extend(["FlaxOPT"])
@@ -171,9 +167,7 @@ else:
try:
if not openllm_core.utils.is_tf_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_tf_objects"] = [
name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)
]
_import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
_import_structure["models.flan_t5"].extend(["TFFlanT5"])
_import_structure["models.opt"].extend(["TFOPT"])
@@ -184,15 +178,7 @@ else:
from .models.opt import TFOPT as TFOPT
# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__lazy = openllm_core.utils.LazyModule(__name__,
globals()["__file__"],
_import_structure,
extra_objects={
"COMPILED": COMPILED,
"__openllm_migration__": {
"LLMEmbeddings": "EmbeddingsOutput"
}
})
__lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED, "__openllm_migration__": {"LLMEmbeddings": "EmbeddingsOutput"}})
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
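For context on the hunk above: `_import_structure` maps submodule paths to the names they export, and `LazyModule` defers the real imports until first attribute access. A minimal sketch of the idea (hypothetical, not openllm's actual `LazyModule`):

from __future__ import annotations
import importlib, types, typing as t

class _LazySketch(types.ModuleType):
  def __init__(self, name: str, import_structure: dict[str, list[str]], extra_objects: dict[str, t.Any] | None = None):
    super().__init__(name)
    # attribute name -> submodule that defines it
    self._attr_to_module = {attr: mod for mod, attrs in import_structure.items() for attr in attrs}
    self._extra = dict(extra_objects or {})
    self.__all__ = [*self._attr_to_module, *self._extra]
  def __getattr__(self, attr: str) -> t.Any:
    if attr in self._extra: return self._extra[attr]
    if attr in self._attr_to_module:
      # import the owning submodule lazily, then pull the attribute off it
      submodule = importlib.import_module(f'.{self._attr_to_module[attr]}', self.__name__)
      return getattr(submodule, attr)
    raise AttributeError(attr)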

View File

@@ -99,10 +99,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
# _cached_LLMFunction_get and _cached_LLMSerialisation_get
globs.update({f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}})
# llm_post_init implementation
lines: ListStr = [
f'_impl_{cls.__name__}_func=cls.llm_post_init',
_setattr_class('llm_post_init', f'__wrapped_llm_post_init(_impl_{cls.__name__}_func)')
]
lines: ListStr = [f'_impl_{cls.__name__}_func=cls.llm_post_init', _setattr_class('llm_post_init', f'__wrapped_llm_post_init(_impl_{cls.__name__}_func)')]
serialisation_attr = {'import_model': import_model, 'load_model': load_model, 'load_tokenizer': load_tokenizer,}
for func, impl in serialisation_attr.items():
@@ -114,10 +111,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
# assign vLLM implementation
if cls.__llm_backend__ == 'vllm':
vllm_func = {
f'_vllm_{it}': fn
for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))
}
vllm_func = {f'_vllm_{it}': fn for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))}
globs.update(vllm_func)
lines.extend([_setattr_class(it[6:], it) for it in vllm_func])
@@ -137,15 +131,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
bool_attr = {it[15:-2] for it in interface_anns if it.startswith('__llm_supports_')}
lines.extend([_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])
return codegen.generate_function(cls,
'__assign_llm_attr',
lines,
args=('cls', *args),
globs=globs,
annotations={
'cls': 't.Type[LLM]',
'return': None
})
return codegen.generate_function(cls, '__assign_llm_attr', lines, args=('cls', *args), globs=globs, annotations={'cls': 't.Type[LLM]', 'return': None})
def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
return generation_result[0]['outputs'][0]['text']
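The `codegen.generate_function` call collapsed above follows the attrs-style trick of assembling a function body from source strings and `exec`-ing it once at class-creation time. Roughly (a hedged sketch, not the actual helper):

import typing as t

def generate_function_sketch(name: str, lines: list[str], args: tuple[str, ...], globs: dict[str, t.Any]) -> t.Callable[..., t.Any]:
  # e.g. lines = ["cls.llm_post_init = __wrapped_llm_post_init(_impl_LLM_func)"]
  src = f"def {name}({', '.join(args)}):\n" + ''.join(f'  {line}\n' for line in lines)
  ns: dict[str, t.Any] = {}
  exec(compile(src, '<llm_codegen>', 'exec'), globs, ns)  # build the function from generated source
  return ns[name]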

View File

@@ -25,8 +25,8 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
return bentoml.transformers.get(ids)
except bentoml.exceptions.NotFound:
model_signatures = {
k: ModelSignature(batchable=False) for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search',
'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__')
k: ModelSignature(batchable=False)
for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__')
}
with bentoml.models.create(ids,
module=MODULE_NAME,
@@ -34,8 +34,7 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
options=ModelOptions(),
context=openllm.utils.generate_context(framework_name='transformers'),
labels={
'runtime': 'pt',
'framework': 'openllm'
'runtime': 'pt', 'framework': 'openllm'
},
signatures=model_signatures) as bentomodel:
snapshot_download(_GENERIC_EMBEDDING_ID,

View File

@@ -14,8 +14,7 @@ LogitsProcessorList = transformers.LogitsProcessorList
StoppingCriteriaList = transformers.StoppingCriteriaList
class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(self, stop_sequences: str | list[str],
tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
self.stop_sequences, self.tokenizer = stop_sequences, tokenizer
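The hunk only shows `__init__`; a stopping criterion like this typically completes with a `__call__` that decodes the running sequence and checks for any stop string. A hypothetical completion (the actual method is not shown in this diff):

def __call__(self, input_ids, scores, **kwargs: t.Any) -> bool:
  # decode the tokens generated so far and stop once any stop sequence appears at the end
  text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
  return any(text.endswith(stop) for stop in self.stop_sequences)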

View File

@@ -278,12 +278,20 @@ class LLM(LLMInterface[M, T], ReprMixin):
if t.TYPE_CHECKING: __name__: str
if t.TYPE_CHECKING and not MYPY:
def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag,
adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str],
quantize_method: t.Optional[t.Literal['int8', 'int4',
'gptq']], serialisation_format: t.Literal['safetensors',
'legacy'], _local: bool, **attrs: t.Any) -> None:
def __attrs_init__(self,
config: LLMConfig,
quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
model_id: str,
model_decls: TupleAny,
model_attrs: DictStrAny,
tokenizer_attrs: DictStrAny,
tag: bentoml.Tag,
adapters_mapping: t.Optional[AdaptersMapping],
model_version: t.Optional[str],
quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
serialisation_format: t.Literal['safetensors', 'legacy'],
_local: bool,
**attrs: t.Any) -> None:
'''Generated __attrs_init__ for openllm.LLM.'''
config: LLMConfig
@@ -432,14 +440,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
_local = False
_model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__)
if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True
quantize = first_not_none(quantize,
t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])),
default=None)
quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None)
# quantization setup
if quantization_config and quantize:
raise ValueError(
"'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
if quantization_config is None and quantize is not None:
quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
if quantize == 'gptq': serialisation = 'safetensors'
@@ -465,9 +470,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
if _tag.version is None:
raise ValueError(f'Failed to resolve the correct model version for {cfg_cls.__openllm_start_name__}')
except Exception as err:
raise OpenLLMException(
f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={_model_id}' (lookup to see its traceback):\n{err}"
) from err
raise OpenLLMException(f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={_model_id}' (lookup to see its traceback):\n{err}") from err
return cls(*args,
model_id=_model_id,
@@ -518,9 +521,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
else:
from .serialisation.transformers._helpers import process_config
model_version = getattr(
process_config(model_id,
trust_remote_code=cls.config_class.__openllm_trust_remote_code__,
revision=first_not_none(model_version, default='main'))[0], '_commit_hash', None)
process_config(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default='main'))[0], '_commit_hash', None)
if model_version is None:
raise ValueError(f"Internal errors when parsing config for pretrained '{model_id}' ('commit_hash' not found)")
return f'{tag_name}:{model_version}'
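The resulting tag is simply `<tag_name>:<commit_hash>`; with hypothetical values:

tag_name = 'pt-facebook-opt-1.3b'   # hypothetical start name
model_version = 'abc1234'           # the HF config's resolved _commit_hash
assert f'{tag_name}:{model_version}' == 'pt-facebook-opt-1.3b:abc1234'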
@@ -529,10 +530,18 @@ class LLM(LLMInterface[M, T], ReprMixin):
def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag:
return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))
def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None,
_tag: bentoml.Tag, _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str,
_serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any,
def __init__(self,
*args: t.Any,
model_id: str,
llm_config: LLMConfig,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
_adapters_mapping: AdaptersMapping | None,
_tag: bentoml.Tag,
_quantize_method: t.Literal['int8', 'int4', 'gptq'] | None,
_model_version: str,
_serialisation_format: t.Literal['safetensors', 'legacy'],
_local: bool,
**attrs: t.Any,
):
'''Initialize the LLM with given pretrained model.
@@ -630,21 +639,27 @@ class LLM(LLMInterface[M, T], ReprMixin):
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
# NOTE: Save the args and kwargs for later load
self.__attrs_init__(llm_config, quantization_config, model_id, args, {
**model_kwds,
**normalized_model_kwds
}, {
**tokenizer_kwds,
**normalized_tokenizer_kwds
}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local)
self.__attrs_init__(llm_config,
quantization_config,
model_id,
args, {
**model_kwds, **normalized_model_kwds
}, {
**tokenizer_kwds, **normalized_tokenizer_kwds
},
_tag,
_adapters_mapping,
_model_version,
_quantize_method,
_serialisation_format,
_local)
self.llm_post_init()
def __setattr__(self, attr: str, value: t.Any) -> None:
if attr in _reserved_namespace:
raise ForbiddenAttributeError(
f'{attr} should not be set during runtime as these values will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.'
)
f'{attr} should not be set during runtime as these values will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.')
super().__setattr__(attr, value)
@property
@@ -738,8 +753,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
model = self.load_model(*self._model_decls, **self._model_attrs)
# If OOM, it probably means you don't have enough VRAM to run this model.
if self.__llm_backend__ == 'pt' and is_torch_available():
loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(
model, 'is_quantized', False)
loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
model = model.to('cuda')
@@ -777,34 +791,22 @@ class LLM(LLMInterface[M, T], ReprMixin):
if name is None:
_converted_first_none = True
name = 'default'
peft_config = default_config.with_config(
**adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type),
adapter_config=adapter.config,
inference_mode=inference_mode,
llm_config_class=self.config_class).to_peft_config()
peft_config = default_config.with_config(**adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(
adapter_type=t.cast('PeftType', _adapter_type), adapter_config=adapter.config, inference_mode=inference_mode, llm_config_class=self.config_class).to_peft_config()
adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id)
if self.__llm_adapter_map__ is None and use_cache: self.__llm_adapter_map__ = adapter_map
return adapter_map
def prepare_for_training(self,
adapter_type: AdapterType = 'lora',
use_gradient_checkpointing: bool = True,
**attrs: t.Any) -> tuple[peft.PeftModel, T]:
def prepare_for_training(self, adapter_type: AdapterType = 'lora', use_gradient_checkpointing: bool = True, **attrs: t.Any) -> tuple[peft.PeftModel, T]:
from peft import prepare_model_for_kbit_training
peft_config = self.config['fine_tune_strategies'].get(
adapter_type, FineTuneConfig(adapter_type=t.cast('PeftType', adapter_type),
llm_config_class=self.config_class)).train().with_config(**attrs).to_peft_config()
wrapped_peft = peft.get_peft_model(
prepare_model_for_kbit_training( # type: ignore[no-untyped-call]
self.model, use_gradient_checkpointing=use_gradient_checkpointing), peft_config)
peft_config = self.config['fine_tune_strategies'].get(adapter_type, FineTuneConfig(adapter_type=t.cast('PeftType', adapter_type),
llm_config_class=self.config_class)).train().with_config(**attrs).to_peft_config()
wrapped_peft = peft.get_peft_model(prepare_model_for_kbit_training( # type: ignore[no-untyped-call]
self.model, use_gradient_checkpointing=use_gradient_checkpointing), peft_config)
if DEBUG: wrapped_peft.print_trainable_parameters()
return wrapped_peft, self.tokenizer
def apply_adapter(self,
inference_mode: bool = True,
adapter_type: AdapterType = 'lora',
load_adapters: t.Literal['all'] | list[str] | None = None,
use_cache: bool = True) -> M:
def apply_adapter(self, inference_mode: bool = True, adapter_type: AdapterType = 'lora', load_adapters: t.Literal['all'] | list[str] | None = None, use_cache: bool = True) -> M:
'''Apply given LoRA mapping to the model. Note that the base model can still be accessed via self.model.get_base_model().'''
if self.__llm_model__ is None: raise ValueError('Error: Model is not loaded correctly')
# early out if _adapters_mapping is empty or it is already wrapped with peft.
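Typical use of the two fine-tuning helpers reflowed above might look like this (hedged sketch; the LoRA kwargs `r` and `lora_alpha` are illustrative):

llm = openllm.AutoLLM.for_model('opt')                 # hypothetical model choice
model, tokenizer = llm.prepare_for_training(adapter_type='lora', r=8, lora_alpha=16)
# later, for inference with previously attached adapters:
peft_model = llm.apply_adapter(inference_mode=True, load_adapters='all')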
@@ -828,10 +830,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
adapters_to_load = adapter_mapping.keys() if load_adapters == 'all' else load_adapters
for adapter_name in adapters_to_load:
_peft_config, _peft_model_id = adapter_mapping[adapter_name]
t.cast(peft.PeftModel, self.__llm_model__).load_adapter(_peft_model_id,
adapter_name=adapter_name,
is_trainable=not inference_mode,
**dict(_peft_config.to_dict()))
t.cast(peft.PeftModel, self.__llm_model__).load_adapter(_peft_model_id, adapter_name=adapter_name, is_trainable=not inference_mode, **dict(_peft_config.to_dict()))
return self.__llm_model__
@@ -848,8 +847,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
# the below shares similar logic with `get_peft_model`
# TODO: Support PromptLearningConfig
if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(default_config, peft.PromptLearningConfig):
logger.debug("Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.",
default_config.task_type)
logger.debug("Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.", default_config.task_type)
model = peft.PeftModel(self.__llm_model__, default_config)
else:
# XXX: it is not ideal to serialize like this; maybe for fine-tune we will only support 0.4.0
@@ -1041,42 +1039,21 @@ class LLM(LLMInterface[M, T], ReprMixin):
# Prevent yielding partial stop sequence
if not partially_stopped:
yield {
'text': output,
'usage': {
'prompt_tokens': input_echo_len,
'completion_tokens': i,
'total_tokens': input_echo_len + i
},
'finish_reason': None
}
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': None}
if stopped: break
# Finish stream event, which contains finish reason
if i == self.config['max_new_tokens'] - 1: finish_reason = 'length'
elif stopped: finish_reason = 'stop'
else: finish_reason = None
yield {
'text': output,
'usage': {
'prompt_tokens': input_echo_len,
'completion_tokens': i,
'total_tokens': input_echo_len + i
},
'finish_reason': finish_reason
}
yield {'text': output, 'usage': {'prompt_tokens': input_echo_len, 'completion_tokens': i, 'total_tokens': input_echo_len + i}, 'finish_reason': finish_reason}
# Clean
del past_key_values, out
gc.collect()
torch.cuda.empty_cache()
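Each chunk yielded above carries the running text plus token accounting; a consumer loop could be as simple as (sketch, `llm` hypothetical):

for chunk in llm.generate_iterator(prompt):
  usage = chunk['usage']   # prompt_tokens / completion_tokens / total_tokens
  if chunk['finish_reason'] is not None:
    print(chunk['text'], f"(stopped: {chunk['finish_reason']}, {usage['total_tokens']} tokens)")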
@overload
def Runner(model_name: str,
*,
model_id: str | None = None,
model_version: str | None = ...,
init_local: t.Literal[False, True] = ...,
**attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
...
@overload
@@ -1158,10 +1135,7 @@ def Runner(model_name: str,
'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
})
backend = t.cast(
LiteralBackend,
first_not_none(backend,
default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
backend = t.cast(LiteralBackend, first_not_none(backend, default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
if init_local: ensure_available = True
runner = infer_auto_class(backend).create_runner(model_name, llm_config=llm_config, ensure_available=ensure_available, **attrs)
if init_local: runner.init_local(quiet=True)
@@ -1174,8 +1148,7 @@ class SetAdapterOutput(t.TypedDict):
success: bool
message: str
def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature,
generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]:
def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature, generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]:
class _Runnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
@@ -1234,7 +1207,8 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
return ' '.join(output_text) + ' '
return types.new_class(
self.__class__.__name__ + 'Runnable', (_Runnable,), {}, lambda ns: ns.update({
self.__class__.__name__ + 'Runnable', (_Runnable,), {},
lambda ns: ns.update({
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
'__module__': self.__module__,
'__doc__': self.config['env'].start_docstring
@@ -1281,12 +1255,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
return {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}
def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs:
yield 'runner_methods', {
method.name: {
'batchable': method.config.batchable,
'batch_dim': method.config.batch_dim if method.config.batchable else None
} for method in __self.runner_methods
}
yield 'runner_methods', {method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None} for method in __self.runner_methods}
yield 'config', self.config.model_dump(flatten=True)
yield 'llm_type', __self.llm_type
yield 'backend', self.__llm_backend__

View File

@@ -15,25 +15,21 @@ if t.TYPE_CHECKING:
from ._llm import LLM
autogptq, torch, transformers = LazyLoader('autogptq', globals(),
'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')
autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')
logger = logging.getLogger(__name__)
QuantiseMode = t.Literal['int8', 'int4', 'gptq']
@overload
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'],
**attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
...
@overload
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'],
**attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
...
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode,
**attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
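These popped attributes presumably feed `transformers.BitsAndBytesConfig`; the int8 branch would reduce to roughly (sketch; defaults mirror the pops above):

import transformers
int8_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=False,
)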

View File

@@ -21,8 +21,7 @@ if t.TYPE_CHECKING:
from bentoml._internal.runner.runner import AbstractRunner
from bentoml._internal.runner.runner import RunnerMethod
from openllm_core._typing_compat import TypeAlias
_EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]],
t.Sequence[openllm.EmbeddingsOutput]]
_EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]], t.Sequence[openllm.EmbeddingsOutput]]
# The following warnings are from bitsandbytes and probably not that important for users to see
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
@@ -44,12 +43,7 @@ svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=ru
_JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None})
@svc.api(route='/v1/generate',
input=_JsonInput,
output=bentoml.io.JSON.from_sample({
'responses': [],
'configuration': llm_config.model_dump(flatten=True)
}))
@svc.api(route='/v1/generate', input=_JsonInput, output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)}))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
config = qa_inputs.llm_config.model_dump()
@@ -86,11 +80,32 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
output=bentoml.io.JSON.from_sample({
'embeddings': [
0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752,
0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589,
0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679,
-0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282,
-0.014814382418990135, 0.01796768605709076
0.007917795330286026,
-0.014421648345887661,
0.00481307040899992,
0.007331526838243008,
-0.0066398633643984795,
0.00945580005645752,
0.0087016262114048,
-0.010709521360695362,
0.012635177001357079,
0.010541186667978764,
-0.00730888033285737,
-0.001783102168701589,
0.02339819073677063,
-0.010825827717781067,
-0.015888236463069916,
0.01876218430697918,
0.0076906150206923485,
0.0009032754460349679,
-0.010024012066423893,
0.01090280432254076,
-0.008668390102684498,
0.02070549875497818,
0.0014594447566196322,
-0.018775740638375282,
-0.014814382418990135,
0.01796768605709076
],
'num_tokens': 20
}))
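A client call against this `/v1/embeddings` endpoint could look like the following (hypothetical host and port):

import httpx
resp = httpx.post('http://localhost:3000/v1/embeddings', json=['What is the meaning of life?'])
payload = resp.json()   # {'embeddings': [...], 'num_tokens': ...}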

View File

@@ -63,11 +63,7 @@ def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'ope
return builder.build('wheel', path, config_settings={'--global-option': '--quiet'})
raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')
def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
llm_fs: FS,
extra_dependencies: tuple[str, ...] | None = None,
adapter_map: dict[str, str | None] | None = None,
) -> PythonOptions:
def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] | None = None, adapter_map: dict[str, str | None] | None = None,) -> PythonOptions:
packages = ['openllm', 'scipy'] # apparently bnb misses this one
if adapter_map is not None: packages += ['openllm[fine-tune]']
# NOTE: add openllm to the default dependencies
@@ -90,8 +86,16 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
elif backend_envvar == 'tf':
if not openllm_core.utils.is_tf_available():
raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'")
candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow',
'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
candidates = ('tensorflow',
'tensorflow-cpu',
'tensorflow-gpu',
'tf-nightly',
'tf-nightly-cpu',
'tf-nightly-gpu',
'intel-tensorflow',
'intel-tensorflow-avx512',
'tensorflow-rocm',
'tensorflow-macos',
)
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
for candidate in candidates:
@@ -109,10 +113,8 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
packages.extend([f'torch>={importlib.metadata.version("torch")}'])
wheels: list[str] = []
built_wheels: list[str | None] = [
build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p))
for p in ('openllm_core', 'openllm_client', 'openllm')
]
built_wheels: list[str | None] = [build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) for p in ('openllm_core', 'openllm_client', 'openllm')]
if all(i for i in built_wheels):
wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
return PythonOptions(packages=packages,
@@ -120,9 +122,14 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
lock_packages=False,
extra_index_url=['https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/'])
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None,
adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
serialisation_format: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry,
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any],
_: FS,
workers_per_resource: float,
quantize: LiteralString | None,
adapter_map: dict[str, str | None] | None,
dockerfile_template: str | None,
serialisation_format: t.Literal['safetensors', 'legacy'],
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
from openllm.cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
@@ -145,9 +152,7 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize)
if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
env=env_dict,
dockerfile_template=dockerfile_template)
return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
OPENLLM_MODEL_NAME = '# openllm: model name'
OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
@@ -188,8 +193,7 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | N
if OPENLLM_MODEL_NAME in it:
src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
elif OPENLLM_MODEL_ADAPTER_MAP in it:
src_contents[src_contents.index(it)] = (
ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
if DEBUG: logger.info('Generated script:\n%s', script)
llm_fs.writetext(llm.config['service_name'], script)
@@ -210,13 +214,7 @@ def create_bento(bento_tag: bentoml.Tag,
_model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
backend_envvar = llm.config['env']['backend_value']
labels = dict(llm.identifying_params)
labels.update({
'_type': llm.llm_type,
'_framework': backend_envvar,
'start_name': llm.config['start_name'],
'base_name_or_path': llm.model_id,
'bundler': 'openllm.bundle'
})
labels.update({'_type': llm.llm_type, '_framework': backend_envvar, 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle'})
if adapter_map: labels.update(adapter_map)
if isinstance(workers_per_resource, str):
if workers_per_resource == 'round_robin': workers_per_resource = 1.0
@@ -242,8 +240,15 @@ def create_bento(bento_tag: bentoml.Tag,
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
models=[llm_spec],
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template,
serialisation_format, container_registry, container_version_strategy))
docker=construct_docker_options(llm,
llm_fs,
workers_per_resource,
quantize,
adapter_map,
dockerfile_template,
serialisation_format,
container_registry,
container_version_strategy))
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
# NOTE: the model_id_path here is only used for setting this environment variable within the container built for BentoLLM.

View File

@@ -42,11 +42,7 @@ ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent
# but in the future, we can infer based on the git repo to provide more options for users
# to build the base image. For now, all of the base images will be <registry>/bentoml/openllm:...
# NOTE: The ECR registry is the public one and currently only the @bentoml team has access to push to it.
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
'docker': 'docker.io/bentoml/openllm',
'gh': 'ghcr.io/bentoml/openllm',
'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm'
}
_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {'docker': 'docker.io/bentoml/openllm', 'gh': 'ghcr.io/bentoml/openllm', 'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm'}
# TODO: support custom fork. Currently it only supports openllm main.
_OWNER = 'bentoml'
@@ -82,9 +78,7 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
commits = t.cast('list[dict[str, t.Any]]', cls._ghapi.repos.list_commits(since=_commit_time_range()))
return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message'])
# now is the correct behaviour
return orjson.loads(
subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags',
'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2]
return orjson.loads(subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', 'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2]
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
@@ -142,9 +136,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
try:
if not _BUILDER.health(): raise openllm.exceptions.Error
except (openllm.exceptions.Error, subprocess.CalledProcessError):
raise RuntimeError(
'Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for installation instructions.'
) from None
raise RuntimeError('Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for installation instructions.') from None
if openllm_core.utils.device_count() == 0:
raise RuntimeError('Building base container requires GPUs (None available)')
if not shutil.which('nvidia-container-runtime'):
@@ -153,8 +145,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
pyproject_path = pathlib.Path(_module_location).parent.parent / 'pyproject.toml'
if not pyproject_path.exists():
raise ValueError(
"This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
if not registries:
tags: dict[str | LiteralContainerRegistry, str] = {
alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()
@@ -171,8 +162,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
quiet=machine)
if machine and outputs is not None: tags['image_sha'] = outputs.decode('utf-8').strip()
except Exception as err:
raise openllm.exceptions.OpenLLMException(
f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err
raise openllm.exceptions.OpenLLMException(f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err
return tags
if t.TYPE_CHECKING:

View File

@@ -43,35 +43,29 @@ _AnyCallable = t.Callable[..., t.Any]
FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])
def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [
sc.CompletionItem(str(it.tag), help='Bento')
for it in bentoml.list()
if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})
]
return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})]
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool,
environ: DictStrAny) -> DictStrAny:
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
_bentoml_config_options_opts = [
'tracing.sample_rate=1.0', f'api_server.traffic.timeout={server_timeout}',
'tracing.sample_rate=1.0',
f'api_server.traffic.timeout={server_timeout}',
f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'
]
if device:
if len(device) > 1:
_bentoml_config_options_opts.extend(
[f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
_bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
else:
_bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
_bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
if cors:
_bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend(
[f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
_bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
_bentoml_config_options_env += (' ' if _bentoml_config_options_env else '') + ' '.join(_bentoml_config_options_opts)
environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
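For a concrete sense of what `parse_config_options` assembles, a hypothetical run with `start_name == 'opt'`, `server_timeout=360`, a runner timeout of 3600, and one worker per resource would set:

# hypothetical resulting value of BENTOML_CONFIG_OPTIONS
environ['BENTOML_CONFIG_OPTIONS'] = (
    'tracing.sample_rate=1.0 '
    'api_server.traffic.timeout=360 '
    'runners."llm-opt-runner".traffic.timeout=3600 '
    'runners."llm-opt-runner".workers_per_resource=1.0'
)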
@@ -123,18 +117,27 @@ Available official model_id(s): [default: {llm_config['default_id']}]
if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
# NOTE: The model requires GPU, therefore we will return a dummy command
command_attrs.update({
'short_help': '(Disabled because there is no GPU available)',
'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
})
return noop_command(group, llm_config, _serve_grpc, **command_attrs)
@group.command(**command_attrs)
@start_decorator(llm_config, serve_grpc=_serve_grpc)
@click.pass_context
def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend, serialisation_format: t.Literal['safetensors', 'legacy'],
cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
def start_cmd(ctx: click.Context,
/,
server_timeout: int,
model_id: str | None,
model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
device: t.Tuple[str, ...],
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
backend: LiteralBackend,
serialisation_format: t.Literal['safetensors', 'legacy'],
cors: bool,
adapter_id: str | None,
return_process: bool,
**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
termui.echo(
@@ -202,8 +205,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
cmd_name = f'openllm build {model_name}'
if adapter_map is not None:
cmd_name += ' ' + ' '.join(
[f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
if not openllm.utils.get_quiet_mode():
termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')
@@ -242,11 +244,15 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
composed = openllm.utils.compose(
llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
llm_config.to_click_options,
_http_server_args if not serve_grpc else _grpc_server_args,
cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup),
model_id_option(factory=cog.optgroup),
model_version_option(factory=cog.optgroup),
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup), backend_option(factory=cog.optgroup),
workers_per_resource_option(factory=cog.optgroup),
cors_option(factory=cog.optgroup),
backend_option(factory=cog.optgroup),
cog.optgroup.group('LLM Optimization Options',
help='''Optimization related options.
@@ -257,7 +263,9 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
''',
), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
),
quantize_option(factory=cog.optgroup),
serialisation_option(factory=cog.optgroup),
cog.optgroup.option('--device',
type=openllm.utils.dantic.CUDA,
multiple=True,
@@ -375,32 +383,16 @@ def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput
**attrs)(f)
def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--cors/--no-cors',
show_default=True,
default=False,
envvar='OPENLLM_CORS',
show_envvar=True,
help='Enable CORS for the server.',
**attrs)(f)
return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f)
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)
def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--model-id',
type=click.STRING,
default=None,
envvar='OPENLLM_MODEL_ID',
show_envvar=True,
help='Optional model_id name or path for (fine-tune) weight.',
**attrs)(f)
return cli_option('--model-id', type=click.STRING, default=None, envvar='OPENLLM_MODEL_ID', show_envvar=True, help='Optional model_id name or path for (fine-tune) weight.', **attrs)(f)
def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--model-version',
type=click.STRING,
default=None,
help='Optional model version to save for this model. It will be inferred automatically from model-id.',
**attrs)(f)
return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. It will be inferred automatically from model-id.', **attrs)(f)
def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
# NOTE: LiteralBackend needs to remove the last two items as ggml and mlc are WIP
@@ -512,8 +504,7 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
try:
float(value) # type: ignore[arg-type]
except ValueError:
raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx,
param) from None
raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None
else:
return value

View File

@@ -83,10 +83,7 @@ def _start(model_name: str,
from .entrypoint import start_command
from .entrypoint import start_grpc_command
llm_config = openllm.AutoConfig.for_model(model_name)
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()),
model_id=model_id,
quantize=quantize)
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name, backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()), model_id=model_id, quantize=quantize)
os.environ[_ModelEnv.backend] = _ModelEnv['backend_value']
args: list[str] = []
@@ -102,9 +99,7 @@ def _start(model_name: str,
if additional_args: args.extend(additional_args)
if __test__: args.append('--return-process')
return start_command_factory(start_command if not _serve_grpc else start_grpc_command,
model_name,
_context_settings=termui.CONTEXT_SETTINGS,
return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS,
_serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
@inject
@@ -199,9 +194,7 @@ def _build(model_name: str,
raise OpenLLMException(str(e)) from None
matched = re.match(r'__tag__:([^:\n]+:[^:\n]+)$', output.decode('utf-8').strip())
if matched is None:
raise ValueError(
f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub."
)
raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
return bentoml.get(matched.group(1), _bento_store=bento_store)
def _import_model(model_name: str,
@@ -256,6 +249,5 @@ def _list_models() -> dict[str, t.Any]:
return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)
start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']

View File

@@ -28,14 +28,10 @@ if t.TYPE_CHECKING:
Note that we already release images on our CI to ECR and GHCR, so you don't need to build them yourself.
''')
@container_registry_option
@click.option('--version-strategy',
type=click.Choice(['release', 'latest', 'nightly']),
default='nightly',
help='Version strategy to use for tagging the image.')
@click.option('--version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='nightly', help='Version strategy to use for tagging the image.')
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool,
machine: bool) -> dict[str, str]:
def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
return mapping

View File

@@ -31,9 +31,7 @@ def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore
except bentoml.exceptions.NotFound:
ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
if 'bundler' not in bentomodel.info.labels or bentomodel.info.labels['bundler'] != 'openllm.bundle':
ctx.fail(
f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness."
)
ctx.fail(f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness.")
if machine: return bentomodel.path
# copy and paste this into a new shell
if psutil.WINDOWS: subprocess.check_call([shutil.which('dir') or 'dir'], cwd=bentomodel.path)

View File

@@ -41,11 +41,6 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento
# for the reconstruction of the Dockerfile.
if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None:
docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
doc = generate_containerfile(docker=DockerOptions(**docker_attrs),
build_ctx=bentomodel.path,
conda=options.conda,
bento_fs=bentomodel._fs,
enable_buildkit=True,
add_header=True)
doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True)
termui.echo(doc, fg='white')
return bentomodel.path

View File

@@ -18,9 +18,7 @@ from openllm_core._prompt import process_prompt
LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('model_name',
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
shell_complete=model_complete_envvar)
@click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
@click.argument('prompt', type=click.STRING)
@output_option
@click.option('--format', type=click.STRING, default=None)
@@ -32,8 +30,7 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
callback=opt_callback,
metavar='ARG=VALUE[,ARG=VALUE]')
@click.pass_context
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any],
**_: t.Any) -> str | None:
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
'''Get the default prompt used by OpenLLM.'''
module = openllm.utils.EnvVarMixin(model_name).module
_memoized = {k: v[0] for k, v in _memoized.items() if v}

View File

@@ -22,17 +22,10 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
'tag': str(b.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
'models': [{
'tag': str(m.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
}
for m in (bentoml.models.get(_.tag)
for _ in b.info.models)]
}
for b in tuple(i
for i in bentoml.list()
if all(k in i.info.labels
for k in {'start_name', 'bundler'}))
if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
} for b in tuple(i for i in bentoml.list() if all(
k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
}
mapping = {k: v for k, v in mapping.items() if v}
if output == 'pretty':

View File

@@ -25,30 +25,17 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
'''This is equivalent to openllm models --show-available less the nice table.'''
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
ids_in_local_store = {
k: [
i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and
'model_name' in i.info.labels and i.info.labels['model_name'] == k
] for k in models
k: [i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k]
for k in models
}
if model_name is not None:
ids_in_local_store = {
k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)]
for k, v in ids_in_local_store.items()
}
ids_in_local_store = {k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
local_models = {
k: [{
'tag': str(i.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(i.path))
} for i in val] for k, val in ids_in_local_store.items()
}
local_models = {k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
if output == 'pretty':
import tabulate
tabulate.PRESERVE_WHITESPACE = True
termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v],
tablefmt='fancy_grid',
headers=['LLM', 'Tag', 'Size']),
fg='white')
termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']), fg='white')
else:
termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
return local_models

View File

@@ -153,13 +153,11 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
def keys(self) -> ConfigModelKeysView:
return t.cast('ConfigModelKeysView',
[self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] +
list(self._extra_content.keys()))
[self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys()))
def values(self) -> ConfigModelValuesView:
return t.cast('ConfigModelValuesView',
[self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] +
list(self._extra_content.values()))
[self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(self._extra_content.values()))
def items(self) -> ConfigModelItemsView:
return t.cast('ConfigModelItemsView',

View File

@@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'),
('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), ('opt', 'OPT'), ('stablelm', 'StableLM'),
('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
class AutoLLM(BaseAutoLLMClass):
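Usage shape for the auto classes, mirroring the fine-tuning scripts later in this diff (the model id here is illustrative):

import openllm

llm = openllm.AutoLLM.for_model('opt', model_id='facebook/opt-125m', ensure_available=True)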

View File

@@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'),
('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
class AutoVLLM(BaseAutoLLMClass):

View File

@@ -12,36 +12,24 @@ from openllm_core.config.configuration_dolly_v2 import get_special_token_id
if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
else:
torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(),
'transformers'), openllm.utils.LazyLoader(
'tf', globals(), 'tensorflow')
'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
logger = logging.getLogger(__name__)
@overload
def get_pipeline(model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: t.Literal[True] = True,
**attrs: t.Any) -> transformers.Pipeline:
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline:
...
@overload
def get_pipeline(model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: t.Literal[False] = ...,
**attrs: t.Any) -> type[transformers.Pipeline]:
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]:
...
def get_pipeline(model: transformers.PreTrainedModel,
tokenizer: transformers.PreTrainedTokenizer,
_init: bool = False,
**attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
# Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
class InstructionTextGenerationPipeline(transformers.Pipeline):
def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any):
super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)
def _sanitize_parameters(self,
return_full_text: bool | None = None,
**generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
if t.TYPE_CHECKING: assert self.tokenizer is not None
preprocess_params: dict[str, t.Any] = {}
# newer versions of the tokenizer configure the response key as a special token. newer versions still may
@@ -87,11 +75,7 @@ def get_pipeline(model: transformers.PreTrainedModel,
instruction_text = input_tensors.pop('instruction_text')
return {'generated_sequence': generated_sequence, 'input_ids': input_ids, 'instruction_text': instruction_text}
def postprocess(self,
model_outputs: dict[str, t.Any],
response_key_token_id: int,
end_key_token_id: int,
return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
if t.TYPE_CHECKING: assert self.tokenizer is not None
_generated_sequence, instruction_text = model_outputs['generated_sequence'][0], model_outputs['instruction_text']
generated_sequence: list[list[int]] = _generated_sequence.numpy().tolist()
@@ -149,10 +133,7 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {}
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
self.tokenizer,
_init=True,
return_full_text=self.config.return_full_text)
return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
llm_config = self.config.model_construct_env(**attrs)
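The paired @overload stubs above let a type checker infer whether get_pipeline returns a Pipeline instance or the Pipeline class from the Literal value of _init. The same pattern in miniature:

from __future__ import annotations
import typing as t

@t.overload
def make(_init: t.Literal[True]) -> object: ...
@t.overload
def make(_init: t.Literal[False] = ...) -> type: ...
def make(_init: bool = False) -> object | type:
  return object() if _init else object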

View File

@@ -18,17 +18,14 @@ class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTraine
with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined]
return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id,
**attrs).to_generation_config()),
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()),
skip_special_tokens=True)
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
openllm.StoppingCriteriaList([]))
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]
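The trailing loop above is the stop-sequence trim; factored out for clarity (the helper name is illustrative):

def trim_stop_sequences(result: str, stop: list[str]) -> str:
  # the inference API returns the stop sequence, so strip it from the tail
  for stop_seq in stop:
    if result.endswith(stop_seq): result = result[:-len(stop_seq)]
  return result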

View File

@@ -23,16 +23,13 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
masked_embeddings = data * mask
sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
num_tokens=int(torch.sum(attention_mask).item()))
return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), num_tokens=int(torch.sum(attention_mask).item()))
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
openllm.StoppingCriteriaList([]))
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]
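The embeddings path in this hunk is masked mean pooling followed by L2 normalisation; isolated, it reads:

import torch
import torch.nn.functional as F

def mean_pool(data: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
  mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
  summed = torch.sum(data * mask, dim=1)
  seq_length = torch.sum(mask, dim=1)
  return F.normalize(summed / seq_length, p=2, dim=1)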

View File

@@ -36,10 +36,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
return {
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32
}, {}
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {}
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
import torch
@@ -51,12 +48,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
config=config,
torch_dtype=torch_dtype,
trust_remote_code=trust_remote_code,
device_map=device_map,
**attrs)
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
try:
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
finally:
@@ -67,12 +59,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
device_map = attrs.pop('device_map', None)
trust_remote_code = attrs.pop('trust_remote_code', True)
config = get_mpt_config(self._bentomodel.path,
self.config.max_sequence_length,
self.device,
device_map=device_map,
trust_remote_code=trust_remote_code,
)
config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path,
config=config,
trust_remote_code=trust_remote_code,
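MPT's import_kwargs heuristic as a standalone function (StarCoder below uses the same shape with torch.float16):

import torch

def default_import_kwargs() -> dict:
  multi_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 1
  return {
      'device_map': 'auto' if multi_gpu else None,
      'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32
  }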

View File

@@ -16,8 +16,7 @@ class FlaxOPT(openllm.LLM['transformers.FlaxOPTForCausalLM', 'transformers.GPT2Tok
__openllm_internal__ = True
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
self.model_id, **self.llm_parameters[-1])
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
return bentoml.transformers.save_model(self.tag,
transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs),
@@ -34,11 +33,7 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
use_default_prompt_template: bool = False,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_k': top_k,
'num_return_sequences': num_return_sequences,
'repetition_penalty': repetition_penalty
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'repetition_penalty': repetition_penalty
}, {}
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:

View File

@@ -11,8 +11,7 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
import transformers
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
self.model_id, **self.llm_parameters[-1])
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
return bentoml.transformers.save_model(self.tag,
transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),

View File

@@ -19,8 +19,5 @@ class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']):
use_default_prompt_template: bool = True,
**attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
'max_new_tokens': max_new_tokens,
'temperature': temperature,
'top_k': top_k,
'num_return_sequences': num_return_sequences
'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences
}, {}

View File

@@ -18,10 +18,7 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
import torch
return {
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32
}, {}
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
import torch
@@ -50,11 +47,9 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
openllm.StoppingCriteriaList([]))
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]

View File

@@ -56,18 +56,13 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
else:
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
model, tokenizer = openllm.AutoLLM.for_model("falcon",
model_id=model_args.model_id,
quantize="int4",
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
ensure_available=True).prepare_for_training(
adapter_type="lora",
lora_alpha=16,
lora_dropout=0.1,
r=16,
bias="none",
target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"])
model, tokenizer = openllm.AutoLLM.for_model("falcon", model_id=model_args.model_id, quantize="int4", bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16,
ensure_available=True).prepare_for_training(adapter_type="lora",
lora_alpha=16,
lora_dropout=0.1,
r=16,
bias="none",
target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"])
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token

View File

@@ -98,8 +98,7 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
print("Sample from dolly-v2 ds:", dataset[randint(0, len(dataset))]["text"])
# tokenize and chunk dataset
lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True,
remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True)
lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True)
# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")
@@ -180,15 +179,11 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
transformers.set_seed(model_args.seed)
model, tokenizer = prepare_for_int4_training(model_args.model_id,
gradient_checkpointing=training_args.gradient_checkpointing,
bf16=training_args.bf16,
)
model, tokenizer = prepare_for_int4_training(model_args.model_id, gradient_checkpointing=training_args.gradient_checkpointing, bf16=training_args.bf16,)
datasets = prepare_datasets(tokenizer)
trainer = transformers.Trainer(model=model,
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
**dataclasses.asdict(training_args)),
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
train_dataset=datasets,
data_collator=transformers.default_data_collator,
)

View File

@@ -56,13 +56,12 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
else:
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8",
ensure_available=True).prepare_for_training(adapter_type="lora",
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none")
model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8", ensure_available=True).prepare_for_training(adapter_type="lora",
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none")
# fine-tune on english_quotes
data = load_dataset("Abirate/english_quotes")

View File

@@ -43,13 +43,10 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
try:
tokenizer = cloudpickle.load(t.cast('t.IO[bytes]', cofile))['tokenizer']
except KeyError:
raise openllm.exceptions.OpenLLMException(
"Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
"For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
raise openllm.exceptions.OpenLLMException("Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
"For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
else:
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'),
trust_remote_code=llm.trust_remote_code,
**tokenizer_attrs)
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs)
if tokenizer.pad_token_id is None:
if config.pad_token_id is not None: tokenizer.pad_token_id = config.pad_token_id
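The error message above spells out the contract; the saving side looks like this (the model id is illustrative):

import bentoml, transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')
bentoml.transformers.save_model('my-llm', model, custom_objects={'tokenizer': tokenizer})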

View File

@@ -6,6 +6,4 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = {
'flax': ('FlaxAutoModelForCausalLM', 'FlaxAutoModelForSeq2SeqLM'),
'vllm': ('AutoModelForCausalLM', 'AutoModelForSeq2SeqLM')
}
HUB_ATTRS = [
'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token'
]
HUB_ATTRS = ['cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token']

View File

@@ -43,11 +43,7 @@ logger = logging.getLogger(__name__)
__all__ = ['import_model', 'get', 'load_model']
@inject
def import_model(llm: openllm.LLM[M, T],
*decls: t.Any,
trust_remote_code: bool,
_model_store: ModelStore = Provide[BentoMLContainer.model_store],
**attrs: t.Any) -> bentoml.Model:
def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, _model_store: ModelStore = Provide[BentoMLContainer.model_store], **attrs: t.Any) -> bentoml.Model:
"""Auto detect model type from given model_id and import it to bentoml's model store.
For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first,
@@ -76,8 +72,7 @@ def import_model(llm: openllm.LLM[M, T],
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
signatures['generate'] = {'batchable': False}
@@ -107,8 +102,7 @@ def import_model(llm: openllm.LLM[M, T],
tokenizer.save_pretrained(bentomodel.path)
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
@@ -124,20 +118,13 @@ def import_model(llm: openllm.LLM[M, T],
else:
architectures = getattr(config, 'architectures', [])
if not architectures:
raise RuntimeError(
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
)
raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
architecture = architectures[0]
update_model(bentomodel, metadata={'_pretrained_class': architecture})
if llm._local:
# possible local path
logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id,
*decls,
config=config,
trust_remote_code=trust_remote_code,
**hub_attrs,
**attrs)
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
# for trust_remote_code to work
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
@@ -149,8 +136,7 @@ def import_model(llm: openllm.LLM[M, T],
else:
bentomodel.flush() # type: ignore[no-untyped-call]
bentomodel.save(_model_store)
openllm.utils.analytics.track(
openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
openllm.utils.analytics.track(openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
finally:
bentomodel.exit_cloudpickle_context(imported_modules)
# NOTE: We need to free up the cache after importing the model
@@ -171,8 +157,7 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
if Version(model.info.api_version) < Version('v2'):
raise openllm.exceptions.OpenLLMException('Please run "openllm prune -y --include-bentos" and upgrade all saved models to the latest release.')
if model.info.labels['backend'] != llm.__llm_backend__:
raise openllm.exceptions.OpenLLMException(
f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.")
raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.")
return model
except Exception as err:
if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
@@ -185,8 +170,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
default=llm._serialisation_format == 'safetensors')
if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,

View File

@@ -23,8 +23,7 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import M
from openllm_core._typing_compat import T
else:
transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(),
'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
_object_setattr = object.__setattr__
@@ -45,11 +44,7 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu
if not isinstance(config, transformers.PretrainedConfig):
copied_attrs = copy.deepcopy(attrs)
if copied_attrs.get('torch_dtype', None) == 'auto': copied_attrs.pop('torch_dtype')
config, attrs = transformers.AutoConfig.from_pretrained(model_id,
return_unused_kwargs=True,
trust_remote_code=trust_remote_code,
**hub_attrs,
**copied_attrs)
config, attrs = transformers.AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs)
return config, hub_attrs, attrs
def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T:
@@ -62,9 +57,7 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra
if llm.config['trust_remote_code']:
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if not hasattr(config, 'auto_map'):
raise ValueError(
f'Invalid configuration for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping'
)
raise ValueError(f'Invalid configuration for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping')
# in case this model doesn't use the correct auto class for its model type, for example chatglm,
# which uses AutoModel instead of AutoModelForCausalLM. Then we fall back to AutoModel
if autoclass not in config.auto_map: autoclass = 'AutoModel'
@@ -84,7 +77,8 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod
based: DictStrAny = copy.deepcopy(bentomodel.info.metadata)
based.update(metadata)
_object_setattr(
bentomodel, '_info',
bentomodel,
'_info',
ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged
tag=bentomodel.info.tag,
module=bentomodel.info.module,
@@ -102,9 +96,7 @@ def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
infer_fn: tuple[str, ...] = ('__call__',)
default_config = ModelSignature(batchable=False)
if llm.__llm_backend__ in {'pt', 'vllm'}:
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search',
'constrained_beam_search',
)
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search',)
elif llm.__llm_backend__ == 'tf':
infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search',)
else:

View File

@@ -15,10 +15,7 @@ if t.TYPE_CHECKING:
logger = logging.getLogger(__name__)
@contextlib.contextmanager
def build_bento(model: str,
model_id: str | None = None,
quantize: t.Literal['int4', 'int8', 'gptq'] | None = None,
cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
def build_bento(model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
logger.info('Building BentoML for %s', model)
bento = openllm.build(model, model_id=model_id, quantize=quantize)
yield bento
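Being a @contextlib.contextmanager, build_bento is consumed with `with`; a minimal usage sketch (the model id is illustrative):

with build_bento('opt', model_id='facebook/opt-125m', cleanup=True) as bento:
  print(bento.tag)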

View File

@@ -31,20 +31,12 @@ def test_missing_default():
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']})
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config('MissingArchitecture', {
'default_id': 'huggingface/t5-tiny-testing',
'model_ids': ['huggingface/t5-tiny-testing'],
'requirements': ['bentoml'],
},
)
make_llm_config('MissingArchitecture', {'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing'], 'requirements': ['bentoml'],},)
def test_forbidden_access():
cl_ = make_llm_config(
'ForbiddenAccess', {
'default_id': 'huggingface/t5-tiny-testing',
'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'],
'architecture': 'PreTrainedModel',
'requirements': ['bentoml'],
'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'], 'architecture': 'PreTrainedModel', 'requirements': ['bentoml'],
},
)
@@ -77,9 +69,7 @@ def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings):
cl_ = make_llm_config('AttrsProtocolLLM', gen_settings)
assert attr.has(cl_)
@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473),
st.floats(min_value=0.0, max_value=1.0),
)
@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),)
def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float):
cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),)
sent = cl_()
@@ -138,9 +128,7 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat
mk.setenv(field_env_key('field1'), str(4.0))
mk.setenv(field_env_key('temperature', suffix='generation'), str(0.2))
sent = make_llm_config('OverwriteWithEnvAvailable', {
'default_id': 'asdfasdf',
'model_ids': ['asdf', 'asdfasdfads'],
'architecture': 'PreTrainedModel'
'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel'
},
fields=(('field1', 'float', 3.0),),
).model_construct_env(field1=20.0, temperature=0.4)

View File

@@ -73,8 +73,7 @@ class ResponseComparator(JSONSnapshotExtension):
return s == t
def eq_output(s: openllm.GenerationOutput, t: openllm.GenerationOutput) -> bool:
return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and
eq_config(s.marshaled_config, t.marshaled_config))
return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and eq_config(s.marshaled_config, t.marshaled_config))
return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])
@@ -210,8 +209,7 @@ def _container_handle(model: str,
detach=True,
device_requests=devs,
ports={
'3000/tcp': port,
'3001/tcp': prom_port
'3000/tcp': port, '3001/tcp': prom_port
},
)
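The ports mapping follows the docker SDK convention of container port to host port; a minimal sketch with an illustrative image name:

import docker

client = docker.from_env()
container = client.containers.run('my-openllm-image', detach=True, ports={'3000/tcp': 3000, '3001/tcp': 3001})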

View File

@@ -49,8 +49,7 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
@pytest.fixture()
def dockerfile_template(tmp_path_factory: pytest.TempPathFactory):
file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template'
file.write_text(
"{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}")
file.write_text("{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}")
return file
@pytest.mark.usefixtures('dockerfile_template')