fix(yapf): align weird newline breaks [generated] [skip ci] (#284)

fix(yapf): align weird newline breaks

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Aaron Pham
2023-09-01 05:34:22 -04:00
committed by GitHub
parent 3e45530abd
commit b7af7765d4
91 changed files with 811 additions and 1678 deletions
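The reformat in the hunks below collapses yapf's awkward continuation-line wrapping into single long call sites. As a minimal sketch of how such a result is produced, the snippet below drives yapf programmatically; the style knobs shown (google base style, a 160-column limit) are assumptions for illustration, not the repository's actual yapf configuration, which is not part of this diff.

from yapf.yapflib.yapf_api import FormatCode

# a call that yapf previously wrapped across two lines
source = (
    '_warnings.filterwarnings(\n'
    '    "ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")\n'
)
# with a wide column limit the call collapses back onto a single line
formatted, changed = FormatCode(source, style_config='{based_on_style: google, column_limit: 160}')
print(formatted)

The same effect is obtained from the command line with something like `yapf --in-place --recursive <package>` once the style file carries the wider column limit.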


@@ -26,14 +26,11 @@ else:
# configuration for bitsandbytes before import
_os.environ["BITSANDBYTES_NOWELCOME"] = _os.environ.get("BITSANDBYTES_NOWELCOME", "1")
# NOTE: The following warnings from bitsandbytes, and probably not that important for users to see when DEBUG is False
_warnings.filterwarnings(
"ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
_warnings.filterwarnings(
"ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
_warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
_warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
_warnings.filterwarnings("ignore", message="The installed version of bitsandbytes was compiled without GPU support.")
# NOTE: ignore the following warning from ghapi as it is not important for users
_warnings.filterwarnings("ignore",
message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
_warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
_import_structure: dict[str, list[str]] = {
"exceptions": [],
@@ -48,13 +45,8 @@ _import_structure: dict[str, list[str]] = {
"_quantisation": ["infer_quantisation_config"],
"_embeddings": ["GenericEmbeddingRunnable"],
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "EmbeddingsOutput"],
"_generation": [
"StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList",
"prepare_logits_processor"
],
"models.auto": [
"MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"
],
"_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
"models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"],
"models.chatglm": [],
"models.baichuan": [],
"models.dolly_v2": [],
@@ -114,8 +106,7 @@ try:
if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_pt_objects"] = [
name for name in dir(utils.dummy_pt_objects)
if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")
name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")
]
else:
_import_structure["models.flan_t5"].extend(["FlanT5"])


@@ -36,7 +36,6 @@ else:
vllm = LazyLoader('vllm', globals(), 'vllm')
def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
@functools.wraps(fn)
def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
trust_remote_code = first_not_none(trust_remote_code, default=self.trust_remote_code)
@@ -48,7 +47,6 @@ def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[
return inner
def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
@functools.wraps(fn)
def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
if self.__llm_backend__ == 'vllm':
@@ -71,7 +69,6 @@ def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vll
return inner
def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]], T]:
@functools.wraps(fn)
def inner(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
return fn(self, **{**self.llm_parameters[-1], **tokenizer_attrs})
@@ -79,7 +76,6 @@ def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]],
return inner
def llm_post_init(fn: llm_post_init_protocol[M, T]) -> t.Callable[[LLM[M, T]], None]:
@functools.wraps(fn)
def inner(self: LLM[M, T]) -> None:
if self.__llm_backend__ == 'pt' and is_torch_available():
@@ -98,8 +94,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
args: ListStr = []
globs: DictStrAny = {'cls': cls, '__wrapped_llm_post_init': llm_post_init, 'LLM': LLM}
# _cached_LLMFunction_get and _ccached_LLMSerialisation_get
globs.update(
{f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}})
globs.update({f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}})
# llm_post_init implementation
lines: ListStr = [
f'_impl_{cls.__name__}_func=cls.llm_post_init',
@@ -112,17 +107,13 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
globs.update({f'__serialisation_{func}': getattr(openllm.serialisation, func, None), impl_name: impl})
cached_func_name = f'_cached_{cls.__name__}_func'
func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMSerialisation_get('{func}') else __serialisation_{func}"
lines.extend([
f'{cached_func_name}=cls.{func}', func_call,
_setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})')
])
lines.extend([f'{cached_func_name}=cls.{func}', func_call, _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})')])
# assign vLLM implementation
if cls.__llm_backend__ == 'vllm':
vllm_func = {
f'_vllm_{it}': fn
for it, fn in zip(('generate', 'generate_iterator',
'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))
for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))
}
globs.update(vllm_func)
lines.extend([_setattr_class(it[6:], it) for it in vllm_func])
@@ -141,8 +132,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
return f'__llm_supports_{key}__'
bool_attr = {it[15:-2] for it in interface_anns if it.startswith('__llm_supports_')}
lines.extend(
[_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])
lines.extend([_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])
return codegen.generate_function(cls,
'__assign_llm_attr',
@@ -154,8 +144,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
'return': None
})
def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]],
**_: t.Any) -> str:
def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
return generation_result[0]['outputs'][0]['text']
def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T],
@@ -193,9 +182,7 @@ def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -
if request_id is None: raise ValueError('request_id must not be None.')
outputs: list[vllm.RequestOutput] = []
# TODO: support prompt_token_ids
self.model.add_request(request_id=request_id,
prompt=prompt,
sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
while self.model.has_unfinished_requests():
outputs.extend([r for r in self.model.step() if r.finished])
return [unmarshal_vllm_outputs(i) for i in outputs]


@@ -25,9 +25,8 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
return bentoml.transformers.get(ids)
except bentoml.exceptions.NotFound:
model_signatures = {
k: ModelSignature(batchable=False)
for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
'group_beam_search', 'constrained_beam_search', '__call__')
k: ModelSignature(batchable=False) for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search',
'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__')
}
with bentoml.models.create(ids,
module=MODULE_NAME,
@@ -39,11 +38,10 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
'framework': 'openllm'
},
signatures=model_signatures) as bentomodel:
snapshot_download(
_GENERIC_EMBEDDING_ID,
local_dir=bentomodel.path,
local_dir_use_symlinks=False,
ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt'])
snapshot_download(_GENERIC_EMBEDDING_ID,
local_dir=bentomodel.path,
local_dir_use_symlinks=False,
ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt'])
return bentomodel
class GenericEmbeddingRunnable(bentoml.Runnable):
@@ -68,10 +66,7 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
model_output = self.model(**encoded_input)
# Perform pooling and normalize
sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
return [
openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(),
num_tokens=int(torch.sum(attention_mask).item()))
]
return [openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))]
@staticmethod
def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
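The hunk above ends at the mean_pooling signature, so its body is not part of this diff. For reference, the standard masked mean-pooling recipe used for sentence embeddings (not necessarily the exact body used here) looks like:

import torch

def mean_pooling(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # broadcast the mask over the hidden dimension: [batch, seq] -> [batch, seq, hidden]
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    # average only over non-padding tokens; the clamp guards against division by zero
    return torch.sum(last_hidden_state * mask, dim=1) / torch.clamp(mask.sum(dim=1), min=1e-9)

The F.normalize(..., p=2, dim=1) call above then L2-normalizes each pooled vector before it is wrapped in EmbeddingsOutput.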


@@ -14,18 +14,15 @@ LogitsProcessorList = transformers.LogitsProcessorList
StoppingCriteriaList = transformers.StoppingCriteriaList
class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer |
transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
def __init__(self, stop_sequences: str | list[str],
tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
self.stop_sequences, self.tokenizer = stop_sequences, tokenizer
def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool:
return any(
self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)
return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)
class StopOnTokens(transformers.StoppingCriteria):
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool:
return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}
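As a usage sketch (not taken from this diff), a criterion like StopSequenceCriteria above plugs into transformers' generation loop via StoppingCriteriaList; the model id is a placeholder and the import assumes the package-level re-export declared in _import_structure earlier in this commit:

import transformers
from openllm import StopSequenceCriteria  # assumed re-export via _import_structure['_generation']

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')  # placeholder model id
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')
stopping = transformers.StoppingCriteriaList([StopSequenceCriteria('\n\n', tokenizer)])
inputs = tokenizer('Q: What does OpenLLM do?\nA:', return_tensors='pt')
# generation halts as soon as the decoded output ends with the stop sequence
output_ids = model.generate(**inputs, max_new_tokens=64, stopping_criteria=stopping)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))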


@@ -122,7 +122,6 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp
_reserved_namespace = {'config_class', 'model', 'tokenizer', 'import_kwargs'}
class LLMFunction(abc.ABC):
@abc.abstractmethod
def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
'''This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.).
@@ -158,8 +157,7 @@ class LLMFunction(abc.ABC):
'''
raise NotImplementedError
def generate_one(self, prompt: str, stop: list[str],
**preprocess_generate_kwds: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
'''The entrypoint for generating one prompt.
This provides additional stop tokens for generating per token level. This is useful when running with agents, or initial streaming support.
@@ -177,7 +175,6 @@ class LLMFunction(abc.ABC):
raise NotImplementedError
class LLMSerialisation(abc.ABC, t.Generic[M, T]):
def import_model(self, *args: t.Any, trust_remote_code: bool, **attrs: t.Any) -> bentoml.Model:
'''Import both model and tokenizer weights into as a BentoML models.
@@ -206,7 +203,6 @@ class LLMSerialisation(abc.ABC, t.Generic[M, T]):
raise NotImplementedError
class LLMInterface(LLMFunction, LLMSerialisation[M, T], abc.ABC):
def llm_post_init(self) -> None:
'''This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals.
By default, this will add `self.device` if the implementation is PyTorch.
@@ -282,12 +278,12 @@ class LLM(LLMInterface[M, T], ReprMixin):
if t.TYPE_CHECKING: __name__: str
if t.TYPE_CHECKING and not MYPY:
def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig,
autogptq.BaseQuantizeConfig]],
model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny,
tag: bentoml.Tag, adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str],
quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any) -> None:
def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag,
adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str],
quantize_method: t.Optional[t.Literal['int8', 'int4',
'gptq']], serialisation_format: t.Literal['safetensors',
'legacy'], _local: bool, **attrs: t.Any) -> None:
'''Generated __attrs_init__ for openllm.LLM.'''
config: LLMConfig
@@ -434,20 +430,16 @@ class LLM(LLMInterface[M, T], ReprMixin):
'''
cfg_cls = cls.config_class
_local = False
_model_id: str = first_not_none(model_id,
os.environ.get(cfg_cls.__openllm_env__['model_id']),
default=cfg_cls.__openllm_default_id__)
_model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__)
if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True
quantize = first_not_none(quantize,
t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']],
os.environ.get(cfg_cls.__openllm_env__['quantize'])),
t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])),
default=None)
# quantization setup
if quantization_config and quantize:
raise ValueError(
"'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument."
)
"'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
if quantization_config is None and quantize is not None:
quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
if quantize == 'gptq': serialisation = 'safetensors'
@@ -460,9 +452,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
)
if adapter_map is None and adapter_id is not None: adapter_map = {adapter_id: adapter_name}
if adapter_map is not None and not is_peft_available():
raise RuntimeError(
"LoRA adapter requires 'peft' to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
)
raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
if adapter_map: logger.debug('OpenLLM will apply the following adapters layers: %s', list(adapter_map))
if llm_config is None:
@@ -517,16 +507,14 @@ class LLM(LLMInterface[M, T], ReprMixin):
model_id, *maybe_revision = model_id.rsplit(':')
if len(maybe_revision) > 0:
if model_version is not None:
logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",
maybe_revision[0], model_version)
logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.", maybe_revision[0], model_version)
return f'{cls.__llm_backend__}-{model_name}:{maybe_revision[0]}'
tag_name = f'{cls.__llm_backend__}-{model_name}'
if openllm_core.utils.check_bool_env('OPENLLM_USE_LOCAL_LATEST', False):
return str(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
if validate_is_path(model_id):
model_id, model_version = resolve_filepath(model_id), first_not_none(model_version,
default=generate_hash_from_file(model_id))
model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
else:
from .serialisation.transformers._helpers import process_config
model_version = getattr(
@@ -542,11 +530,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))
def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
_adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag,
_quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None,
_tag: bentoml.Tag, _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str,
_serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any,
):
):
'''Initialize the LLM with given pretrained model.
> [!WARNING]
@@ -662,8 +649,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
@property
def trust_remote_code(self) -> bool:
return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE'),
default=self.config['trust_remote_code'])
return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE'), default=self.config['trust_remote_code'])
@property
def adapters_mapping(self) -> AdaptersMapping | None:
@@ -698,10 +684,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
@property
def identifying_params(self) -> DictStrAny:
return {
'configuration': self.config.model_dump_json().decode(),
'model_ids': orjson.dumps(self.config['model_ids']).decode()
}
return {'configuration': self.config.model_dump_json().decode(), 'model_ids': orjson.dumps(self.config['model_ids']).decode()}
@property
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]:
@@ -755,8 +738,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
model = self.load_model(*self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
if self.__llm_backend__ == 'pt' and is_torch_available():
loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(
model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(
model, 'is_quantized', False)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
model = model.to('cuda')
@@ -785,24 +768,20 @@ class LLM(LLMInterface[M, T], ReprMixin):
_converted_first_none = False
for _adapter_type, _adapters_tuples in self._adapters_mapping.items():
strategy = first_not_none(self.config['fine_tune_strategies'].get(_adapter_type),
default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type),
llm_config_class=self.config_class))
default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type), llm_config_class=self.config_class))
default_config = strategy.eval() if inference_mode else strategy.train()
for adapter in _adapters_tuples:
if not adapter.name and _converted_first_none:
raise ValueError(
f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}"
)
raise ValueError(f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}")
name = adapter.name
if name is None:
_converted_first_none = True
name = 'default'
peft_config = default_config.with_config(
**adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(
adapter_type=t.cast('PeftType', _adapter_type),
adapter_config=adapter.config,
inference_mode=inference_mode,
llm_config_class=self.config_class).to_peft_config()
**adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type),
adapter_config=adapter.config,
inference_mode=inference_mode,
llm_config_class=self.config_class).to_peft_config()
adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id)
if self.__llm_adapter_map__ is None and use_cache: self.__llm_adapter_map__ = adapter_map
return adapter_map
@@ -834,8 +813,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
_mapping = self._transpose_adapter_mapping(inference_mode=inference_mode, use_cache=use_cache)
if adapter_type not in _mapping:
raise ValueError(
f'Given adapter type {adapter_type} is not supported. Please choose from {list(_mapping.keys())}')
raise ValueError(f'Given adapter type {adapter_type} is not supported. Please choose from {list(_mapping.keys())}')
adapter_mapping = _mapping[adapter_type]
self.__llm_model__ = self._wrap_default_peft_model(adapter_mapping, inference_mode=inference_mode)
@@ -857,25 +835,21 @@ class LLM(LLMInterface[M, T], ReprMixin):
return self.__llm_model__
def _wrap_default_peft_model(self, adapter_mapping: dict[str, tuple[peft.PeftConfig, str]],
inference_mode: bool) -> M:
def _wrap_default_peft_model(self, adapter_mapping: dict[str, tuple[peft.PeftConfig, str]], inference_mode: bool) -> M:
if self.__llm_model__ is None: raise ValueError('Error: Model is not loaded correctly')
if isinstance(self.__llm_model__, peft.PeftModel): return self.__llm_model__
if not isinstance(self.__llm_model__, transformers.PreTrainedModel):
raise ValueError('Loading LoRA layers currently only runs on PyTorch models.')
if 'default' not in adapter_mapping:
raise ValueError(
"There is no 'default' mapping. Please check the adapter mapping and report this bug to the OpenLLM team.")
raise ValueError("There is no 'default' mapping. Please check the adapter mapping and report this bug to the OpenLLM team.")
default_config, peft_model_id = adapter_mapping.pop('default')
# the below shared similar logics with `get_peft_model`
# TODO: Support PromptLearningConfig
if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(
default_config, peft.PromptLearningConfig):
logger.debug(
"Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.",
default_config.task_type)
if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(default_config, peft.PromptLearningConfig):
logger.debug("Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.",
default_config.task_type)
model = peft.PeftModel(self.__llm_model__, default_config)
else:
# XXX: this is not ideal to serialize like this, maybe for fine-tune we will only support 0.4.0
@@ -894,12 +868,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
# order of these fields matter here, make sure to sync it with
# openllm.models.auto.factory.BaseAutoLLMClass.for_model
def to_runner(
self,
models: list[bentoml.Model] | None = None,
max_batch_size: int | None = None,
max_latency_ms: int | None = None,
scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]:
def to_runner(self,
models: list[bentoml.Model] | None = None,
max_batch_size: int | None = None,
max_latency_ms: int | None = None,
scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]:
'''Convert this LLM into a Runner.
Args:
@@ -1047,10 +1020,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
else:
tmp_output_ids = output_ids[input_echo_len:]
rfind_start = 0
output = self.tokenizer.decode(tmp_output_ids,
skip_special_tokens=True,
spaces_between_special_tokens=False,
clean_up_tokenization_spaces=True)
output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
partially_stopped = False
if stop:
@@ -1183,25 +1153,17 @@ def Runner(model_name: str,
'''
if llm_config is not None:
attrs.update({
'model_id':
llm_config['env']['model_id_value'],
'quantize':
llm_config['env']['quantize_value'],
'serialisation':
first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
'model_id': llm_config['env']['model_id_value'],
'quantize': llm_config['env']['quantize_value'],
'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
})
backend = t.cast(
LiteralBackend,
first_not_none(backend,
default=EnvVarMixin(
model_name,
backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
if init_local: ensure_available = True
runner = infer_auto_class(backend).create_runner(model_name,
llm_config=llm_config,
ensure_available=ensure_available,
**attrs)
runner = infer_auto_class(backend).create_runner(model_name, llm_config=llm_config, ensure_available=ensure_available, **attrs)
if init_local: runner.init_local(quiet=True)
return runner
@@ -1214,7 +1176,6 @@ class SetAdapterOutput(t.TypedDict):
def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature,
generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]:
class _Runnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
SUPPORTS_CPU_MULTI_THREADING = True
@@ -1252,8 +1213,7 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
return self.generate(prompt, **attrs)
@bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore
def generate_one(__self: _Runnable, prompt: str, stop: list[str],
**attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
adapter_name = attrs.pop('adapter_name', None)
if adapter_name is not None: __self.set_adapter(adapter_name)
return self.generate_one(prompt, stop, **attrs)
@@ -1275,22 +1235,15 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
return types.new_class(
self.__class__.__name__ + 'Runnable', (_Runnable,), {}, lambda ns: ns.update({
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu')
if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
'__module__':
self.__module__,
'__doc__':
self.config['env'].start_docstring
'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
'__module__': self.__module__,
'__doc__': self.config['env'].start_docstring
}))
def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
def available_adapters(_: LLMRunner[M, T]) -> PeftAdapterOutput:
if not is_peft_available():
return PeftAdapterOutput(
success=False,
result={},
error_msg="peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'")
return PeftAdapterOutput(success=False, result={}, error_msg="peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'")
if self.__llm_adapter_map__ is None:
return PeftAdapterOutput(success=False, result={}, error_msg='No adapters available for current running server.')
if not isinstance(self.model, peft.PeftModel):


@@ -16,8 +16,7 @@ if t.TYPE_CHECKING:
from ._llm import LLM
autogptq, torch, transformers = LazyLoader('autogptq', globals(),
'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader(
'transformers', globals(), 'transformers')
'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')
logger = logging.getLogger(__name__)
@@ -33,9 +32,8 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal[
**attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
...
def infer_quantisation_config(
cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode,
**attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode,
**attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -61,7 +59,7 @@ def infer_quantisation_config(
llm_int8_threshhold=int8_threshold,
llm_int8_skip_modules=int8_skip_modules,
llm_int8_has_fp16_weight=int8_has_fp16_weight,
)
)
# 4 bit configuration
int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
@@ -72,9 +70,7 @@ def infer_quantisation_config(
# quantize is a openllm.LLM feature, where we can quantize the model
# with bitsandbytes or quantization aware training.
if not is_bitsandbytes_available():
raise RuntimeError(
"Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
)
raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
if is_transformers_supports_kbit():
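The quantisation hunks above pop bitsandbytes-related keyword arguments (llm_int8_threshhold, llm_int8_skip_modules, bnb_4bit_compute_dtype, …) and assemble a config object from them. Roughly, the objects built on the 'int8' and 'int4' paths correspond to transformers.BitsAndBytesConfig instances along these lines; the values are assumed defaults for illustration, not the exact ones openllm ends up with:

import torch
import transformers

# 8-bit path: LLM.int8() with the usual outlier threshold
int8_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,
)
# 4-bit path: bfloat16 compute, as hinted by the popped bnb_4bit_compute_dtype default
int4_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4',  # assumption; the hunk does not show the quant type
    bnb_4bit_use_double_quant=True,  # assumption
)
# either config is then handed to from_pretrained(..., quantization_config=...)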


@@ -21,35 +21,28 @@ if t.TYPE_CHECKING:
from bentoml._internal.runner.runner import AbstractRunner
from bentoml._internal.runner.runner import RunnerMethod
from openllm_core._typing_compat import TypeAlias
_EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]],
[t.List[str]], t.Sequence[openllm.EmbeddingsOutput]]
_EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]],
t.Sequence[openllm.EmbeddingsOutput]]
# The following warnings from bitsandbytes, and probably not that important for users to see
warnings.filterwarnings('ignore',
message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
warnings.filterwarnings('ignore',
message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
model = os.environ.get('OPENLLM_MODEL', '{__model_name__}') # openllm: model name
adapter_map = os.environ.get('OPENLLM_ADAPTER_MAP', '''{__model_adapter_map__}''') # openllm: model adapter map
llm_config = openllm.AutoConfig.for_model(model)
runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
generic_embedding_runner = bentoml.Runner(
openllm.GenericEmbeddingRunnable, # XXX: remove arg-type once bentoml.Runner is correct set with type
name='llm-generic-embedding',
scheduling_strategy=openllm_core.CascadingResourceStrategy,
max_batch_size=32,
max_latency_ms=300)
generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, # XXX: remove arg-type once bentoml.Runner is correct set with type
name='llm-generic-embedding',
scheduling_strategy=openllm_core.CascadingResourceStrategy,
max_batch_size=32,
max_latency_ms=300)
runners: list[AbstractRunner] = [runner]
if not runner.supports_embeddings: runners.append(generic_embedding_runner)
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)
_JsonInput = bentoml.io.JSON.from_sample({
'prompt': '',
'llm_config': llm_config.model_dump(flatten=True),
'adapter_name': None
})
_JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None})
@svc.api(route='/v1/generate',
input=_JsonInput,
@@ -67,10 +60,7 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
echo = input_dict.pop('echo', False)
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
return runner.generate_iterator.async_stream(qa_inputs.prompt,
adapter_name=qa_inputs.adapter_name,
echo=echo,
**qa_inputs.llm_config.model_dump())
return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump())
@svc.api(route='/v1/metadata',
input=bentoml.io.Text(),
@@ -96,12 +86,10 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
output=bentoml.io.JSON.from_sample({
'embeddings': [
0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008,
-0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362,
0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589,
0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918,
0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076,
-0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282,
0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752,
0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589,
0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679,
-0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282,
-0.014814382418990135, 0.01796768605709076
],
'num_tokens': 20
@@ -121,8 +109,7 @@ if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
raise openllm.exceptions.OpenLLMException(f'Invalid JSON input received: {err}') from None
stop = input_data.parameters.pop('stop', ['\n'])
try:
return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters),
status_code=200)
return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters), status_code=200)
except NotImplementedError:
return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)


@@ -10,10 +10,7 @@ from openllm_core.utils import LazyModule
_import_structure: dict[str, list[str]] = {
'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
'oci': [
'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name',
'supported_registries', 'RefResolver'
]
'oci': ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver']
}
if t.TYPE_CHECKING:


@@ -43,8 +43,7 @@ logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
def build_editable(path: str,
package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
'''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.'''
if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != 'true': return None
# We need to build the package in editable mode, so that we can import it
@@ -52,9 +51,7 @@ def build_editable(path: str,
from build.env import IsolatedEnvBuilder
module_location = openllm_core.utils.pkg.source_locations(package)
if not module_location:
raise RuntimeError(
'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.'
)
raise RuntimeError('Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.')
pyproject_path = Path(module_location).parent.parent / 'pyproject.toml'
if os.path.isfile(pyproject_path.__fspath__()):
logger.info('Generating built wheels for package %s...', package)
@@ -64,14 +61,13 @@ def build_editable(path: str,
builder.scripts_dir = env.scripts_dir
env.install(builder.build_system_requires)
return builder.build('wheel', path, config_settings={'--global-option': '--quiet'})
raise RuntimeError(
'Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')
raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')
def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
llm_fs: FS,
extra_dependencies: tuple[str, ...] | None = None,
adapter_map: dict[str, str | None] | None = None,
) -> PythonOptions:
) -> PythonOptions:
packages = ['openllm', 'scipy'] # apparently bnb misses this one
if adapter_map is not None: packages += ['openllm[fine-tune]']
# NOTE: add openllm to the default dependencies
@@ -90,16 +86,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
if backend_envvar == 'flax':
if not openllm_core.utils.is_flax_available():
raise ValueError(f"Flax is not available, while {env.backend} is set to 'flax'")
packages.extend(
[importlib.metadata.version('flax'),
importlib.metadata.version('jax'),
importlib.metadata.version('jaxlib')])
packages.extend([importlib.metadata.version('flax'), importlib.metadata.version('jax'), importlib.metadata.version('jaxlib')])
elif backend_envvar == 'tf':
if not openllm_core.utils.is_tf_available():
raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'")
candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu',
'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
)
candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow',
'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
)
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
for candidate in candidates:
try:
@@ -125,15 +118,11 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
return PythonOptions(packages=packages,
wheels=wheels,
lock_packages=False,
extra_index_url=[
'https://download.pytorch.org/whl/cu118',
'https://huggingface.github.io/autogptq-index/whl/cu118/'
])
extra_index_url=['https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/'])
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float,
quantize: LiteralString | None, adapter_map: dict[str, str | None] | None,
dockerfile_template: str | None, serialisation_format: t.Literal['safetensors', 'legacy'],
container_registry: LiteralContainerRegistry,
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None,
adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
serialisation_format: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
from openllm.cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
@@ -156,10 +145,9 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
_env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize)
if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
return DockerOptions(
base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
env=env_dict,
dockerfile_template=dockerfile_template)
return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
env=env_dict,
dockerfile_template=dockerfile_template)
OPENLLM_MODEL_NAME = '# openllm: model name'
OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
@@ -193,17 +181,15 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
from openllm_core.utils import DEBUG
model_name = llm.config['model_name']
logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'],
llm_fs.getsyspath('/'))
logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/'))
with open(_service_file.__fspath__(), 'r') as f:
src_contents = f.readlines()
for it in src_contents:
if OPENLLM_MODEL_NAME in it:
src_contents[src_contents.index(it)] = (
ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
elif OPENLLM_MODEL_ADAPTER_MAP in it:
src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(
orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
src_contents[src_contents.index(it)] = (
ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
if DEBUG: logger.info('Generated script:\n%s', script)
llm_fs.writetext(llm.config['service_name'], script)
@@ -235,14 +221,12 @@ def create_bento(bento_tag: bentoml.Tag,
if isinstance(workers_per_resource, str):
if workers_per_resource == 'round_robin': workers_per_resource = 1.0
elif workers_per_resource == 'conserved':
workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 /
openllm_core.utils.device_count())
workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count())
else:
try:
workers_per_resource = float(workers_per_resource)
except ValueError:
raise ValueError(
"'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
elif isinstance(workers_per_resource, int):
workers_per_resource = float(workers_per_resource)
logger.info("Building Bento for '%s'", llm.config['start_name'])
@@ -258,10 +242,8 @@ def create_bento(bento_tag: bentoml.Tag,
exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
models=[llm_spec],
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize,
adapter_map, dockerfile_template,
serialisation_format, container_registry,
container_version_strategy))
docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template,
serialisation_format, container_registry, container_version_strategy))
bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
# NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM.


@@ -68,8 +68,7 @@ def _commit_time_range(r: int = 5) -> str:
class VersionNotSupported(openllm.exceptions.OpenLLMException):
"""Raised when the stable release is too low that it doesn't include OpenLLM base container."""
_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple',
['git_hash', 'version', 'strategy'])
_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy'])
def nightly_resolver(cls: type[RefResolver]) -> str:
# NOTE: all openllm container will have sha-<git_hash[:7]>
@@ -84,10 +83,8 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message'])
# now is the correct behaviour
return orjson.loads(
subprocess.check_output([
docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags',
'docker://ghcr.io/bentoml/openllm'
]).decode().strip())['Tags'][-2]
subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags',
'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2]
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
@@ -107,20 +104,16 @@ class RefResolver:
# NOTE: This strategy will only support openllm>0.2.12
meta: dict[str, t.Any] = cls._ghapi.repos.get_latest_release()
version_str = meta['name'].lstrip('v')
version: tuple[str,
str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
else:
version = ('', version_str)
if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12):
raise VersionNotSupported(
f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'"
)
raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
return _RefTuple((*version, 'release' if _use_base_strategy else 'custom'))
@classmethod
@functools.lru_cache(maxsize=64)
def from_strategy(cls,
strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver:
def from_strategy(cls, strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver:
# using default strategy
if strategy_or_version is None or strategy_or_version == 'release': return cls(*cls._release_ref())
elif strategy_or_version == 'latest': return cls('latest', '0.0.0', 'latest')
@@ -128,8 +121,7 @@ class RefResolver:
_ref = cls._nightly_ref()
return cls(_ref[0], '0.0.0', _ref[-1])
else:
logger.warning('Using custom %s. Make sure that it is at lease 0.2.12 for base container support.',
strategy_or_version)
logger.warning('Using custom %s. Make sure that it is at lease 0.2.12 for base container support.', strategy_or_version)
return cls(*cls._release_ref(version_str=strategy_or_version))
@property
@@ -162,8 +154,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
pyproject_path = pathlib.Path(_module_location).parent.parent / 'pyproject.toml'
if not pyproject_path.exists():
raise ValueError(
"This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'"
)
"This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
if not registries:
tags: dict[str | LiteralContainerRegistry, str] = {
alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()
@@ -181,18 +172,14 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
if machine and outputs is not None: tags['image_sha'] = outputs.decode('utf-8').strip()
except Exception as err:
raise openllm.exceptions.OpenLLMException(
f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}'
) from err
f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err
return tags
if t.TYPE_CHECKING:
CONTAINER_NAMES: dict[LiteralContainerRegistry, str]
supported_registries: list[str]
__all__ = [
'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries',
'RefResolver'
]
__all__ = ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver']
def __dir__() -> list[str]:
return sorted(__all__)


@@ -50,14 +50,10 @@ def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete
]
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
return [
sc.CompletionItem(inflection.dasherize(it), help='Model')
for it in openllm.CONFIG_MAPPING
if it.startswith(incomplete)
]
return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float,
device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool,
environ: DictStrAny) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
_bentoml_config_options_opts = [
@@ -67,22 +63,15 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
]
if device:
if len(device) > 1:
_bentoml_config_options_opts.extend([
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
for idx, dev in enumerate(device)
])
_bentoml_config_options_opts.extend(
[f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
else:
_bentoml_config_options_opts.append(
f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
_bentoml_config_options_opts.append(
f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
_bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
_bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
if cors:
_bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend(
['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
_bentoml_config_options_opts.extend([
f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
])
[f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
_bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
@@ -104,17 +93,13 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0] if len(adapter_name) > 0 else None
return None
def start_command_factory(group: click.Group,
model: str,
_context_settings: DictStrAny | None = None,
_serve_grpc: bool = False) -> click.Command:
def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command:
llm_config = openllm.AutoConfig.for_model(model)
command_attrs: DictStrAny = dict(
name=llm_config['model_name'],
context_settings=_context_settings or termui.CONTEXT_SETTINGS,
short_help=f"Start a LLMServer for '{model}'",
aliases=[llm_config['start_name']] if llm_config['name_type'] == 'dasherize' else None,
help=f'''\
command_attrs: DictStrAny = dict(name=llm_config['model_name'],
context_settings=_context_settings or termui.CONTEXT_SETTINGS,
short_help=f"Start a LLMServer for '{model}'",
aliases=[llm_config['start_name']] if llm_config['name_type'] == 'dasherize' else None,
help=f'''\
{llm_config['env'].start_docstring}
\b
@@ -133,15 +118,13 @@ Available official model_id(s): [default: {llm_config['default_id']}]
\b
{orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
''',
)
)
if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
# NOTE: The model requires GPU, therefore we will return a dummy command
command_attrs.update({
'short_help':
'(Disabled because there is no GPU available)',
'help':
f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
'short_help': '(Disabled because there is no GPU available)',
'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
})
return noop_command(group, llm_config, _serve_grpc, **command_attrs)
@@ -150,12 +133,10 @@ Available official model_id(s): [default: {llm_config['default_id']}]
@click.pass_context
def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend,
serialisation_format: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None,
return_process: bool, **attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env(
'OPENLLM_SERIALIZATION_WARNING'):
quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend, serialisation_format: t.Literal['safetensors', 'legacy'],
cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
termui.echo(
f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
fg='yellow')
@@ -184,10 +165,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
wpr = float(wpr)
# Create a new model env to work with the envvar during CLI invocation
env = openllm.utils.EnvVarMixin(config['model_name'],
backend,
model_id=model_id or config['default_id'],
quantize=quantize)
env = openllm.utils.EnvVarMixin(config['model_name'], backend, model_id=model_id or config['default_id'], quantize=quantize)
requirements = llm_config['requirements']
if requirements is not None and len(requirements) > 0:
missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
@@ -218,17 +196,14 @@ Available official model_id(s): [default: {llm_config['default_id']}]
serialisation=serialisation_format)
start_env.update({env.config: llm.config.model_dump_json().decode()})
server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer(
'_service:svc', **server_attrs)
server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
openllm.utils.analytics.track_start_init(llm.config)
def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
cmd_name = f'openllm build {model_name}'
if adapter_map is not None:
cmd_name += ' ' + ' '.join([
f'--adapter-id {s}'
for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
])
cmd_name += ' ' + ' '.join(
[f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
if not openllm.utils.get_quiet_mode():
termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')
@@ -265,17 +240,13 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
return noop
def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
def wrapper(fn: FC) -> t.Callable[[FC], FC]:
composed = openllm.utils.compose(
llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
cog.optgroup.group(
'General LLM Options',
help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup),
cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
backend_option(factory=cog.optgroup),
workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup), backend_option(factory=cog.optgroup),
cog.optgroup.group('LLM Optimization Options',
help='''Optimization related options.
@@ -286,7 +257,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
''',
), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
cog.optgroup.option('--device',
type=openllm.utils.dantic.CUDA,
multiple=True,
@@ -312,8 +283,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
'''),
cog.optgroup.option('--adapter-id',
default=None,
help='Optional name or path for given LoRA adapter' +
f" to wrap '{llm_config['model_name']}'",
help='Optional name or path for given LoRA adapter' + f" to wrap '{llm_config['model_name']}'",
multiple=True,
callback=_id_callback,
metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'),
@@ -323,8 +293,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
return wrapper
def parse_device_callback(ctx: click.Context, param: click.Parameter,
value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
if value is None: return value
if not isinstance(value, tuple): ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})')
el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
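Editor's note: the flattening performed right above can be reproduced standalone; click hands a repeatable --device option over as a tuple of tuples.

# Each occurrence of --device contributes one inner tuple; they are collapsed into a single flat tuple.
value = (('0', '1'), ('2',))
el = tuple(i for k in value for i in k)
print(el)  # ('0', '1', '2')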
@@ -342,19 +311,15 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]
from bentoml_cli.cli import cli
command = 'serve' if not serve_grpc else 'serve-grpc'
group = cog.optgroup.group(
f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options",
help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
)
group = cog.optgroup.group(f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options",
help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
)
def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
serve_command = cli.commands[command]
# The first variable is the argument bento
# The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
serve_options = [
p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
if p.name not in _IGNORED_OPTIONS
]
serve_options = [p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS]
for options in reversed(serve_options):
attrs = options.to_info_dict()
# we don't need param_type_name, since it should all be options
@@ -391,10 +356,7 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC |
cli_option = functools.partial(_click_factory_type, attr='option')
cli_argument = functools.partial(_click_factory_type, attr='argument')
def output_option(f: _AnyCallable | None = None,
*,
default_value: LiteralOutput = 'pretty',
**attrs: t.Any) -> t.Callable[[FC], FC]:
def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]:
output = ['json', 'pretty', 'porcelain']
def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
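Editor's note: the completion callback above presumably filters the three output values by the partially typed word. A minimal sketch under that assumption, with the click CompletionItem wrapper left out.

output = ['json', 'pretty', 'porcelain']

# Assumed prefix matching for shell completion of '--output'.
def complete_output_var(incomplete: str) -> list:
    return [o for o in output if o.startswith(incomplete)]

print(complete_output_var('p'))  # ['pretty', 'porcelain']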
@@ -434,12 +396,11 @@ def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable
**attrs)(f)
def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--model-version',
type=click.STRING,
default=None,
help='Optional model version to save for this model. It will be inferred automatically from model-id.',
**attrs)(f)
return cli_option('--model-version',
type=click.STRING,
default=None,
help='Optional model version to save for this model. It will be inferred automatically from model-id.',
**attrs)(f)
def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
# NOTE: LiteralBackend needs to remove the last two item as ggml and mlc is wip
@@ -453,10 +414,7 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[
**attrs)(f)
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_argument('model_name',
type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
required=required,
**attrs)(f)
return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
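Editor's note: the choices above are produced with inflection.dasherize. A quick standalone check of the conversions involved (requires the inflection package).

import inflection

# Config keys are underscored internally and dasherized for the CLI argument.
print(inflection.dasherize('flan_t5'))   # 'flan-t5'
print(inflection.underscore('flan-t5'))  # 'flan_t5'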
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--quantise',
@@ -482,10 +440,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
> [!NOTE] that quantization are currently only available in *PyTorch* models.''',
**attrs)(f)
def workers_per_resource_option(f: _AnyCallable | None = None,
*,
build: bool = False,
**attrs: t.Any) -> t.Callable[[FC], FC]:
def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--workers-per-resource',
default=None,
callback=workers_per_resource_callback,
@@ -536,18 +491,16 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
**attrs)(f)
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
'--container-registry',
'container_registry',
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
default='ecr',
show_default=True,
show_envvar=True,
envvar='OPENLLM_CONTAINER_REGISTRY',
callback=container_registry_callback,
help=
'The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
**attrs)(f)
return cli_option('--container-registry',
'container_registry',
type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
default='ecr',
show_default=True,
show_envvar=True,
envvar='OPENLLM_CONTAINER_REGISTRY',
callback=container_registry_callback,
help='The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
**attrs)(f)
_wpr_strategies = {'round_robin', 'conserved'}
@@ -559,9 +512,8 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
try:
float(value) # type: ignore[arg-type]
except ValueError:
raise click.BadParameter(
f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.",
ctx, param) from None
raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx,
param) from None
else:
return value
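Editor's note: stripped of the click plumbing, the validation above accepts either a named strategy or anything parseable as a float. A standalone sketch:

_wpr_strategies = {'round_robin', 'conserved'}

def validate_workers_per_resource(value: str) -> str:
    # Named strategies pass through; everything else must parse as a float.
    if value in _wpr_strategies:
        return value
    try:
        float(value)
    except ValueError:
        raise ValueError(f"'workers_per_resource' only accepts {_wpr_strategies} as strategies, otherwise pass in a float.") from None
    return value

print(validate_workers_per_resource('0.25'))        # '0.25'
print(validate_workers_per_resource('round_robin'))  # 'round_robin'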

View File

@@ -84,8 +84,7 @@ def _start(model_name: str,
from .entrypoint import start_grpc_command
llm_config = openllm.AutoConfig.for_model(model_name)
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
backend=openllm_core.utils.first_not_none(
backend, default=llm_config.default_backend()),
backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()),
model_id=model_id,
quantize=quantize)
os.environ[_ModelEnv.backend] = _ModelEnv['backend_value']
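Editor's note: first_not_none is used above to fall back to the config's default backend. An assumed re-implementation showing the intended behaviour; the real helper is openllm_core.utils.first_not_none and may differ.

# Return the first argument that is not None, otherwise the default.
def first_not_none(*args, default=None):
    return next((arg for arg in args if arg is not None), default)

print(first_not_none(None, 'pt', default='vllm'))  # 'pt'
print(first_not_none(None, default='vllm'))        # 'vllm'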
@@ -94,26 +93,19 @@ def _start(model_name: str,
if model_id: args.extend(['--model-id', model_id])
if timeout: args.extend(['--server-timeout', str(timeout)])
if workers_per_resource:
args.extend([
'--workers-per-resource',
str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource
])
args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
if quantize: args.extend(['--quantize', str(quantize)])
if cors: args.append('--cors')
if adapter_map:
args.extend(
list(
itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()
])))
args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
if additional_args: args.extend(additional_args)
if __test__: args.append('--return-process')
return start_command_factory(start_command if not _serve_grpc else start_grpc_command,
model_name,
_context_settings=termui.CONTEXT_SETTINGS,
_serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None,
standalone_mode=False)
_serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)
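Editor's note: the --adapter-id flattening a few lines up can be checked on its own; the adapter ids below are made up for illustration.

import itertools

# Each adapter entry becomes one '--adapter-id' flag, with ':name' appended only when a name is set.
adapter_map = {'aarnphm/opt-lora': 'default', '/local/path/adapter': None}
args = list(itertools.chain.from_iterable(
    [['--adapter-id', f"{k}{':' + v if v else ''}"] for k, v in adapter_map.items()]))
print(args)  # ['--adapter-id', 'aarnphm/opt-lora:default', '--adapter-id', '/local/path/adapter']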
@inject
def _build(model_name: str,
@@ -180,9 +172,7 @@ def _build(model_name: str,
Returns:
``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
args: list[str] = [
sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format
]
args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format]
if quantize: args.extend(['--quantize', quantize])
if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
if push: args.extend(['--push'])
@@ -265,8 +255,7 @@ def _list_models() -> dict[str, t.Any]:
from .entrypoint import models_command
return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)
start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(
_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']

View File

@@ -34,8 +34,8 @@ if t.TYPE_CHECKING:
help='Version strategy to use for tagging the image.')
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None,
version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool,
machine: bool) -> dict[str, str]:
mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
return mapping

View File

@@ -24,10 +24,7 @@ if t.TYPE_CHECKING:
@machine_option
@click.pass_context
@inject
def cli(ctx: click.Context,
bento: str,
machine: bool,
_bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
'''Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path).'''
try:
bentomodel = _bento_store.get(bento)

View File

@@ -19,9 +19,7 @@ from openllm_core.utils import bentoml_cattr
if t.TYPE_CHECKING:
from bentoml._internal.bento import BentoStore
@click.command('get_containerfile',
context_settings=termui.CONTEXT_SETTINGS,
help='Return Containerfile of any given Bento.')
@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject

View File

@@ -32,8 +32,8 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
callback=opt_callback,
metavar='ARG=VALUE[,ARG=VALUE]')
@click.pass_context
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool,
_memoized: dict[str, t.Any], **_: t.Any) -> str | None:
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any],
**_: t.Any) -> str | None:
'''Get the default prompt used by OpenLLM.'''
module = openllm.utils.EnvVarMixin(model_name).module
_memoized = {k: v[0] for k, v in _memoized.items() if v}
@@ -46,15 +46,11 @@ def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None,
if format is None:
if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None:
raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
raise click.BadOptionUsage(
'format',
f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
raise click.BadOptionUsage('format', f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
if prompt_mapping is None:
raise click.BadArgumentUsage(
f'Failed to find prompt mapping while the default prompt for {model_name} is a callable.') from None
raise click.BadArgumentUsage(f'Failed to find prompt mapping while the default prompt for {model_name} is a callable.') from None
if format not in prompt_mapping:
raise click.BadOptionUsage(
'format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
raise click.BadOptionUsage('format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
_prompt_template = template(format)
else:
_prompt_template = template

View File

@@ -19,26 +19,28 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
'''List available bentos built by OpenLLM.'''
mapping = {
k: [{
'tag':
str(b.tag),
'size':
human_readable_size(openllm.utils.calc_dir_size(b.path)),
'tag': str(b.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
'models': [{
'tag': str(m.tag),
'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
} for b in tuple(i for i in bentoml.list() if all(
k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k
] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
}
for m in (bentoml.models.get(_.tag)
for _ in b.info.models)]
}
for b in tuple(i
for i in bentoml.list()
if all(k in i.info.labels
for k in {'start_name', 'bundler'}))
if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
}
mapping = {k: v for k, v in mapping.items() if v}
if output == 'pretty':
import tabulate
tabulate.PRESERVE_WHITESPACE = True
termui.echo(tabulate.tabulate(
[(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v],
tablefmt='fancy_grid',
headers=['LLM', 'Tag', 'Size', 'Models']),
termui.echo(tabulate.tabulate([(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v],
tablefmt='fancy_grid',
headers=['LLM', 'Tag', 'Size', 'Models']),
fg='white')
else:
termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
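Editor's note: the 'pretty' branch above renders the mapping with tabulate; a toy render with made-up rows.

import tabulate

# Rows mirror the (LLM, Tag, Size, Models) tuples built from the mapping above.
rows = [('opt', 'opt-service:abcdef', '2.1 GiB', ['pt-facebook-opt-1.3b:012345'])]
print(tabulate.tabulate(rows, tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size', 'Models']))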

View File

@@ -26,17 +26,14 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
ids_in_local_store = {
k: [
i for i in bentoml.models.list() if 'framework' in i.info.labels and
i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and
'model_name' in i.info.labels and i.info.labels['model_name'] == k
] for k in models
}
if model_name is not None:
ids_in_local_store = {
k: [
i
for i in v
if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)
] for k, v in ids_in_local_store.items()
k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)]
for k, v in ids_in_local_store.items()
}
ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
local_models = {

View File

@@ -34,12 +34,7 @@ def load_notebook_metadata() -> DictStrAny:
@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('output-dir', default=None, required=False)
@click.option('--port',
envvar='JUPYTER_PORT',
show_envvar=True,
show_default=True,
default=8888,
help='Default port for Jupyter server')
@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
@click.pass_context
def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
"""OpenLLM Playground.
@@ -60,9 +55,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
> This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
"""
if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
raise RuntimeError(
"Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
)
raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
metadata = load_notebook_metadata()
_temp_dir = False
if output_dir is None:
@@ -74,8 +67,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
for module in pkgutil.iter_modules(playground.__path__):
if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
logger.debug('Skipping: %s (%s)', module.name,
'File already exists' if not module.ispkg else f'{module.name} is a module')
logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
continue
if not isinstance(module.module_finder, importlib.machinery.FileFinder): continue
termui.echo('Generating notebook for: ' + module.name, fg='magenta')
@@ -84,10 +76,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
f.cells.insert(0, markdown_cell)
jupytext.write(f, os.path.join(output_dir, module.name + '.ipynb'), fmt='notebook')
try:
subprocess.check_output([
sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port',
str(port), '--no-browser', '--debug'
])
subprocess.check_output([sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port', str(port), '--no-browser', '--debug'])
except subprocess.CalledProcessError as e:
termui.echo(e.output, fg='red')
raise click.ClickException(f'Failed to start a jupyter server:\n{e}') from None

View File

@@ -16,9 +16,5 @@ def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.An
t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)
COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
CONTEXT_SETTINGS: DictStrAny = {
'help_option_names': ['-h', '--help'],
'max_content_width': COLUMNS,
'token_normalize_func': inflection.underscore
}
CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS']

View File

@@ -30,9 +30,7 @@ class BaseAutoLLMClass:
_model_mapping: t.ClassVar[_LazyAutoMapping]
def __init__(self, *args: t.Any, **attrs: t.Any):
raise EnvironmentError(
f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead."
)
raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.")
@classmethod
def for_model(cls,
@@ -50,10 +48,7 @@ class BaseAutoLLMClass:
>>> llm = openllm.AutoLLM.for_model("flan-t5")
```
'''
llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id,
model_version=model_version,
llm_config=llm_config,
**attrs)
llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
if ensure_available: llm.ensure_model_id_exists()
return llm
@@ -116,9 +111,7 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
This OrderedDict values() and keys() returns the list instead, so you don't
have to do list(mapping.values()) to get the list of values.
"""
def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString],
model_mapping: OrderedDict[LiteralString, LiteralString]):
def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], model_mapping: OrderedDict[LiteralString, LiteralString]):
self._config_mapping = config_mapping
self._reverse_config_mapping = {v: k for k, v in config_mapping.items()}
self._model_mapping = model_mapping
@@ -153,32 +146,26 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
return ReprMixin.__repr__(self)
def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]:
yield from ((key, (value, self._model_mapping[key]))
for key, value in self._config_mapping.items()
if key in self._model_mapping)
yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping)
def __bool__(self) -> bool:
return bool(self.keys())
def keys(self) -> ConfigModelKeysView:
return t.cast('ConfigModelKeysView', [
self._load_attr_from_module(key, name)
for key, name in self._config_mapping.items()
if key in self._model_mapping.keys()
] + list(self._extra_content.keys()))
return t.cast('ConfigModelKeysView',
[self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] +
list(self._extra_content.keys()))
def values(self) -> ConfigModelValuesView:
return t.cast('ConfigModelValuesView', [
self._load_attr_from_module(key, name)
for key, name in self._model_mapping.items()
if key in self._config_mapping.keys()
] + list(self._extra_content.values()))
return t.cast('ConfigModelValuesView',
[self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] +
list(self._extra_content.values()))
def items(self) -> ConfigModelItemsView:
return t.cast('ConfigModelItemsView', [(self._load_attr_from_module(
key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
for key in self._model_mapping.keys()
if key in self._config_mapping.keys()] + list(self._extra_content.items()))
return t.cast('ConfigModelItemsView',
[(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
for key in self._model_mapping.keys()
if key in self._config_mapping.keys()] + list(self._extra_content.items()))
def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]:
return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys()))
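Editor's note: the keys/values/items above resolve classes lazily from their modules. A heavily simplified sketch of that idea (not the real _LazyAutoMapping), using pathlib purely as a stand-in module.

import importlib
from collections import OrderedDict

class TinyLazyMapping(OrderedDict):
    def __init__(self, names: OrderedDict, module: str):
        super().__init__()
        self._names, self._module = names, module

    def __getitem__(self, key: str):
        # Attribute is only resolved from the module when the key is accessed.
        return getattr(importlib.import_module(self._module), self._names[key])

    def keys(self):
        return list(self._names.keys())

mapping = TinyLazyMapping(OrderedDict([('purepath', 'PurePath')]), 'pathlib')
print(mapping['purepath'])  # <class 'pathlib.PurePath'>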

View File

@@ -7,10 +7,9 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'),
('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'),
('baichuan', 'Baichuan')])
MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'),
('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), ('opt', 'OPT'), ('stablelm', 'StableLM'),
('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
class AutoLLM(BaseAutoLLMClass):

View File

@@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'),
('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'),
('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)

View File

@@ -11,6 +11,5 @@ class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrai
import torch
inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined]
outputs = self.model.generate(**inputs,
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

View File

@@ -14,9 +14,7 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
self.model.eval()
# Only use half precision if the model is not yet quantized
if self.config.use_half_precision: self.model.half()
return self.model.chat(self.tokenizer,
prompt,
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
import torch

View File

@@ -11,8 +11,9 @@ from openllm_core.config.configuration_dolly_v2 import RESPONSE_KEY
from openllm_core.config.configuration_dolly_v2 import get_special_token_id
if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
else:
torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader(
'transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(),
'transformers'), openllm.utils.LazyLoader(
'tf', globals(), 'tensorflow')
logger = logging.getLogger(__name__)
@overload
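Editor's note: the LazyLoader assignments above defer the heavy torch/transformers/tensorflow imports. A rough sketch of the pattern under assumed semantics; this is not openllm.utils.LazyLoader itself.

import importlib
import types

class LazyModule(types.ModuleType):
    def __init__(self, name: str):
        super().__init__(name)
        self._lazy_name, self._lazy_module = name, None

    def __getattr__(self, item: str):
        # The real module is imported only on first attribute access.
        if self._lazy_module is None:
            self._lazy_module = importlib.import_module(self._lazy_name)
        return getattr(self._lazy_module, item)

json = LazyModule('json')    # nothing imported yet
print(json.dumps({'a': 1}))  # the real 'json' module is imported here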
@@ -35,22 +36,8 @@ def get_pipeline(model: transformers.PreTrainedModel,
**attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
# Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
class InstructionTextGenerationPipeline(transformers.Pipeline):
def __init__(self,
*args: t.Any,
do_sample: bool = True,
max_new_tokens: int = 256,
top_p: float = 0.92,
top_k: int = 0,
**kwargs: t.Any):
super().__init__(*args,
model=model,
tokenizer=tokenizer,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
top_p=top_p,
top_k=top_k,
**kwargs)
def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any):
super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)
def _sanitize_parameters(self,
return_full_text: bool | None = None,
@@ -59,8 +46,7 @@ def get_pipeline(model: transformers.PreTrainedModel,
preprocess_params: dict[str, t.Any] = {}
# newer versions of the tokenizer configure the response key as a special token. newer versions still may
# append a newline to yield a single token. find whatever token is configured for the response key.
tokenizer_response_key = next(
(token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
response_key_token_id = None
end_key_token_id = None
if tokenizer_response_key:
@@ -84,17 +70,15 @@ def get_pipeline(model: transformers.PreTrainedModel,
inputs['instruction_text'] = input_
return t.cast(t.Dict[str, t.Any], inputs)
def _forward(self, input_tensors: dict[str, t.Any],
**generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
if t.TYPE_CHECKING: assert self.tokenizer is not None
input_ids, attention_mask = input_tensors['input_ids'], input_tensors.get('attention_mask', None)
if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
else: in_b = input_ids.shape[0]
generated_sequence = self.model.generate(
input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
pad_token_id=self.tokenizer.pad_token_id,
**generate_kwargs)
generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
pad_token_id=self.tokenizer.pad_token_id,
**generate_kwargs)
out_b = generated_sequence.shape[0]
if self.framework == 'pt':
generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
@@ -162,10 +146,7 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
return {
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
'torch_dtype': torch.bfloat16
}, {}
return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {}
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
@@ -176,6 +157,4 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
llm_config = self.config.model_construct_env(**attrs)
with torch.inference_mode():
return self.model(prompt,
return_full_text=llm_config.return_full_text,
generation_config=llm_config.to_generation_config())
return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())

View File

@@ -4,42 +4,31 @@ import typing as t
import openllm
if t.TYPE_CHECKING: import torch, transformers
else:
torch, transformers = openllm.utils.LazyLoader('torch', globals(),
'torch'), openllm.utils.LazyLoader('transformers', globals(),
'transformers')
torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers')
class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
__openllm_internal__ = True
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
return {
'torch_dtype': torch.bfloat16,
'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None
}, {}
return {'torch_dtype': torch.bfloat16, 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
eos_token_id, inputs = attrs.pop('eos_token_id',
self.tokenizer.eos_token_id), self.tokenizer(prompt,
return_tensors='pt').to(self.device)
eos_token_id, inputs = attrs.pop('eos_token_id', self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors='pt').to(self.device)
with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined]
return self.tokenizer.batch_decode(self.model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()),
return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
generation_config=self.config.model_construct_env(eos_token_id=eos_token_id,
**attrs).to_generation_config()),
skip_special_tokens=True)
def generate_one(self, prompt: str, stop: list[str],
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
'stopping_criteria', openllm.StoppingCriteriaList([]))
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'],
max_new_tokens=max_new_tokens,
stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]
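Editor's note: the trailing loop above strips the stop sequence that the model tends to echo back; standalone:

# If the decoded text ends with one of the stop strings, cut that suffix off.
def trim_stop_sequences(result: str, stop: list) -> str:
    for stop_seq in stop:
        if result.endswith(stop_seq):
            result = result[:-len(stop_seq)]
    return result

print(trim_stop_sequences('a helpful answer\nUser:', ['\nUser:']))  # 'a helpful answer'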

View File

@@ -11,11 +11,10 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
import torch
with torch.inference_mode():
return self.tokenizer.batch_decode(
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)
return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)
def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
import torch

View File

@@ -32,10 +32,9 @@ class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'tra
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
# NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
decoder_start_token_id = attrs.pop('decoder_start_token_id', 0)
return self.tokenizer.batch_decode(self.model.generate(
self.tokenizer(prompt, return_tensors='np')['input_ids'],
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
decoder_start_token_id=decoder_start_token_id).sequences,
return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors='np')['input_ids'],
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
decoder_start_token_id=decoder_start_token_id).sequences,
skip_special_tokens=True,
clean_up_tokenization_spaces=True)

View File

@@ -8,8 +8,7 @@ class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transfo
__openllm_internal__ = True
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
return self.tokenizer.batch_decode(self.model.generate(
self.tokenizer(prompt, return_tensors='tf').input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors='tf').input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)

View File

@@ -26,17 +26,13 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
num_tokens=int(torch.sum(attention_mask).item()))
def generate_one(self, prompt: str, stop: list[str],
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
'stopping_criteria', openllm.StoppingCriteriaList([]))
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'],
max_new_tokens=max_new_tokens,
stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]

View File

@@ -48,11 +48,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
device_map = attrs.pop('device_map', None)
attrs.pop('low_cpu_mem_usage', None)
config = get_mpt_config(self.model_id,
self.config.max_sequence_length,
self.device,
device_map=device_map,
trust_remote_code=trust_remote_code)
config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
@@ -62,10 +58,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
device_map=device_map,
**attrs)
try:
return bentoml.transformers.save_model(self.tag,
model,
custom_objects={'tokenizer': tokenizer},
labels=generate_labels(self))
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
finally:
torch.cuda.empty_cache()
@@ -79,7 +72,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
self.device,
device_map=device_map,
trust_remote_code=trust_remote_code,
)
)
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path,
config=config,
trust_remote_code=trust_remote_code,

View File

@@ -16,12 +16,11 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
__openllm_internal__ = True
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
config, tokenizer = transformers.AutoConfig.from_pretrained(
self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
return bentoml.transformers.save_model(self.tag,
transformers.FlaxAutoModelForCausalLM.from_pretrained(
self.model_id, **attrs),
transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs),
custom_objects={'tokenizer': tokenizer},
labels=generate_labels(self))
@@ -45,6 +44,5 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='np'),
do_sample=True,
generation_config=self.config.model_construct_env(
**attrs).to_generation_config()).sequences,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
skip_special_tokens=True)

View File

@@ -18,8 +18,7 @@ class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
import torch
with torch.inference_mode():
return self.tokenizer.batch_decode(
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)
return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)

View File

@@ -11,18 +11,16 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
import transformers
config, tokenizer = transformers.AutoConfig.from_pretrained(
self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
return bentoml.transformers.save_model(self.tag,
transformers.TFOPTForCausalLM.from_pretrained(
self.model_id, trust_remote_code=trust_remote_code, **attrs),
transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),
custom_objects={'tokenizer': tokenizer},
labels=generate_labels(self))
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
return self.tokenizer.batch_decode(
self.model.generate(**self.tokenizer(prompt, return_tensors='tf'),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)
return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='tf'),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
skip_special_tokens=True)

View File

@@ -17,11 +17,10 @@ class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTN
import torch
with torch.inference_mode():
return [
self.tokenizer.decode(
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
pad_token_id=self.tokenizer.eos_token_id,
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0],
skip_special_tokens=True)
self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
pad_token_id=self.tokenizer.eos_token_id,
stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0],
skip_special_tokens=True)
]

View File

@@ -28,19 +28,10 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
import transformers
torch_dtype, device_map = attrs.pop('torch_dtype', torch.float16), attrs.pop('device_map', 'auto')
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.add_special_tokens({
'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
'pad_token': EOD
})
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
torch_dtype=torch_dtype,
device_map=device_map,
**attrs)
tokenizer.add_special_tokens({'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], 'pad_token': EOD})
model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
try:
return bentoml.transformers.save_model(self.tag,
model,
custom_objects={'tokenizer': tokenizer},
labels=generate_labels(self))
return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
finally:
torch.cuda.empty_cache()
@@ -49,26 +40,21 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
with torch.inference_mode():
# eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
# NOTE: support fine-tuning starcoder
result_tensor = self.model.generate(
self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
generation_config=self.config.model_construct_env(**attrs).to_generation_config())
# TODO: We will probably want to return the tokenizer here so that we can manually process this
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
def generate_one(self, prompt: str, stop: list[str],
**preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
'stopping_criteria', openllm.StoppingCriteriaList([]))
def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
openllm.StoppingCriteriaList([]))
stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
result = self.tokenizer.decode(
self.model.generate(encoded_inputs['input_ids'],
max_new_tokens=max_new_tokens,
stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
if result.endswith(stop_seq): result = result[:-len(stop_seq)]

View File

@@ -61,16 +61,13 @@ model, tokenizer = openllm.AutoLLM.for_model("falcon",
quantize="int4",
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
ensure_available=True).prepare_for_training(adapter_type="lora",
lora_alpha=16,
lora_dropout=0.1,
r=16,
bias="none",
target_modules=[
"query_key_value", "dense",
"dense_h_to_4h",
"dense_4h_to_h"
])
ensure_available=True).prepare_for_training(
adapter_type="lora",
lora_alpha=16,
lora_dropout=0.1,
r=16,
bias="none",
target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"])
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token
@@ -81,9 +78,8 @@ trainer = SFTTrainer(model=model,
dataset_text_field="text",
max_seq_length=model_args.max_sequence_length,
tokenizer=tokenizer,
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
**dataclasses.asdict(training_args)),
)
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
)
# upcast layernorm in float32 for more stable training
for name, module in trainer.model.named_modules():
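Editor's note: the args=dataclasses.replace(...) pattern above merges user-supplied arguments into a freshly constructed arguments object. A sketch with a toy dataclass standing in for transformers.TrainingArguments.

import dataclasses

@dataclasses.dataclass
class ToyTrainingArguments:
    output_dir: str
    learning_rate: float = 5e-5
    num_train_epochs: int = 3

user_args = ToyTrainingArguments('outputs', learning_rate=2e-4)
# Build library defaults first, then override every field with the user-supplied values.
merged = dataclasses.replace(ToyTrainingArguments(user_args.output_dir), **dataclasses.asdict(user_args))
print(merged)  # ToyTrainingArguments(output_dir='outputs', learning_rate=0.0002, num_train_epochs=3)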

View File

@@ -78,10 +78,7 @@ def chunk(sample, chunk_length=2048):
batch_chunk_length = (batch_total_length // chunk_length) * chunk_length
# Split by chunks of max_len.
result = {
k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
for k, t in concatenated_examples.items()
}
result = {k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)] for k, t in concatenated_examples.items()}
# add remainder to global variable for next batch
remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
# prepare labels
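Editor's note: the chunking above can be run with tiny numbers to see the packing and the carried-over remainder.

# Pack concatenated token ids into fixed-length chunks; leftovers carry to the next batch.
chunk_length = 4
concatenated_examples = {'input_ids': list(range(10))}
batch_total_length = len(concatenated_examples['input_ids'])
batch_chunk_length = (batch_total_length // chunk_length) * chunk_length
result = {k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
          for k, t in concatenated_examples.items()}
remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
print(result)     # {'input_ids': [[0, 1, 2, 3], [4, 5, 6, 7]]}
print(remainder)  # {'input_ids': [8, 9]}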
@@ -101,8 +98,7 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
print("Sample from dolly-v2 ds:", dataset[randint(0, len(dataset))]["text"])
# tokenize and chunk dataset
lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]),
batched=True,
lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True,
remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True)
# Print total number of samples
@@ -113,7 +109,7 @@ def prepare_for_int4_training(model_id: str,
model_version: str | None = None,
gradient_checkpointing: bool = True,
bf16: bool = True,
) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
from peft.tuners.lora import LoraLayer
llm = openllm.AutoLLM.for_model("llama",
@@ -124,16 +120,14 @@ def prepare_for_int4_training(model_id: str,
bnb_4bit_compute_dtype=torch.bfloat16,
use_cache=not gradient_checkpointing,
device_map="auto",
)
)
print("Model summary:", llm.model)
# get lora target modules
modules = find_all_linear_names(llm.model)
print(f"Found {len(modules)} modules to quantize: {modules}")
model, tokenizer = llm.prepare_for_training(adapter_type="lora",
use_gradient_checkpointing=gradient_checkpointing,
target_modules=modules)
model, tokenizer = llm.prepare_for_training(adapter_type="lora", use_gradient_checkpointing=gradient_checkpointing, target_modules=modules)
# pre-process the model by upcasting the layer norms in float 32 for
for name, module in model.named_modules():
@@ -189,7 +183,7 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
model, tokenizer = prepare_for_int4_training(model_args.model_id,
gradient_checkpointing=training_args.gradient_checkpointing,
bf16=training_args.bf16,
)
)
datasets = prepare_datasets(tokenizer)
trainer = transformers.Trainer(model=model,
@@ -197,7 +191,7 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
**dataclasses.asdict(training_args)),
train_dataset=datasets,
data_collator=transformers.default_data_collator,
)
)
trainer.train()
@@ -212,14 +206,10 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
del model, trainer
torch.cuda.empty_cache()
model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir,
low_cpu_mem_usage=True,
torch_dtype=torch.float16)
model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
# merge lora with base weights and save
model = model.merge_and_unload()
model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"),
safe_serialization=True,
max_shard_size="2GB")
model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"), safe_serialization=True, max_shard_size="2GB")
else:
trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora"))

View File

@@ -26,14 +26,12 @@ if t.TYPE_CHECKING:
DEFAULT_MODEL_ID = "facebook/opt-6.7b"
def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any,
training_args: TrainingArguments):
def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments):
return transformers.Trainer(model=model,
train_dataset=dataset_dict["train"],
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
**dataclasses.asdict(training_args)),
args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
)
@dataclasses.dataclass
class TrainingArguments:
@@ -58,16 +56,13 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
else:
model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
model, tokenizer = openllm.AutoLLM.for_model("opt",
model_id=model_args.model_id,
quantize="int8",
ensure_available=True).prepare_for_training(
adapter_type="lora",
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none")
model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8",
ensure_available=True).prepare_for_training(adapter_type="lora",
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none")
# ft on english_quotes
data = load_dataset("Abirate/english_quotes")

View File

@@ -59,14 +59,12 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
return tokenizer
class _Caller(t.Protocol[P]):
def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
...
_extras = ['get', 'import_model', 'load_model']
def _make_dispatch_function(fn: str) -> _Caller[P]:
def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
"""Generic function dispatch to correct serialisation submodules based on LLM runtime.

View File

@@ -7,6 +7,5 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = {
'vllm': ('AutoModelForCausalLM', 'AutoModelForSeq2SeqLM')
}
HUB_ATTRS = [
'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision',
'subfolder', 'use_auth_token'
'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token'
]

View File

@@ -13,11 +13,7 @@ if t.TYPE_CHECKING:
_conversion_strategy = {'pt': 'ggml'}
def import_model(llm: openllm.LLM[t.Any, t.Any],
*decls: t.Any,
trust_remote_code: bool = True,
**attrs: t.Any,
) -> bentoml.Model:
def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model:
raise NotImplementedError('Currently work in progress.')
def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:

View File

@@ -68,24 +68,18 @@ def import_model(llm: openllm.LLM[M, T],
config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
_, tokenizer_attrs = llm.llm_parameters
quantize_method = llm._quantize_method
safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'),
default=llm._serialisation_format == 'safetensors')
safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors')
# Disable safe serialization with vLLM
if llm.__llm_backend__ == 'vllm': safe_serialisation = False
metadata: DictStrAny = {
'safe_serialisation': safe_serialisation,
'_quantize': quantize_method is not None and quantize_method
}
metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method}
signatures: DictStrAny = {}
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
)
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
signatures['generate'] = {'batchable': False}
else:
# this model might be called with --quantize int4, therefore we need to pop this out
@@ -95,10 +89,7 @@ def import_model(llm: openllm.LLM[M, T],
if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation
metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id,
trust_remote_code=trust_remote_code,
**hub_attrs,
**tokenizer_attrs)
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
external_modules: list[types.ModuleType] = [importlib.import_module(tokenizer.__module__)]
@@ -117,25 +108,18 @@ def import_model(llm: openllm.LLM[M, T],
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
)
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id,
*decls,
quantize_config=t.cast('autogptq.BaseQuantizeConfig',
llm.quantization_config),
quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
trust_remote_code=trust_remote_code,
use_safetensors=safe_serialisation,
**hub_attrs,
**attrs)
update_model(bentomodel,
metadata={
'_pretrained_class': model.__class__.__name__,
'_framework': model.model.framework
})
update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework})
model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation)
else:
architectures = getattr(config, 'architectures', [])
@@ -159,18 +143,14 @@ def import_model(llm: openllm.LLM[M, T],
model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
else:
# we will clone all the things into the bentomodel path without loading the model into memory
snapshot_download(llm.model_id,
local_dir=bentomodel.path,
local_dir_use_symlinks=False,
ignore_patterns=HfIgnore.ignore_patterns(llm))
snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
except Exception:
raise
else:
bentomodel.flush() # type: ignore[no-untyped-call]
bentomodel.save(_model_store)
openllm.utils.analytics.track(
openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module,
model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
finally:
bentomodel.exit_cloudpickle_context(imported_modules)
# NOTE: We need to free up the cache after importing the model
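The snapshot path in the hunk above downloads the weights straight into the bento model directory without instantiating the model. A minimal standalone sketch of that call, with a placeholder model id, target directory, and ignore patterns, looks like this:
# Standalone sketch of the snapshot_download path above; model id, directory, and patterns are placeholders.
from huggingface_hub import snapshot_download

snapshot_download("facebook/opt-125m", local_dir="/tmp/opt-125m", local_dir_use_symlinks=False, ignore_patterns=["*.h5", "*.msgpack"])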
@@ -189,36 +169,29 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
try:
model = bentoml.models.get(llm.tag)
if Version(model.info.api_version) < Version('v2'):
raise openllm.exceptions.OpenLLMException(
'Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
raise openllm.exceptions.OpenLLMException('Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
if model.info.labels['backend'] != llm.__llm_backend__:
raise openllm.exceptions.OpenLLMException(
f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}."
)
f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.")
return model
except Exception as err:
if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
raise openllm.exceptions.OpenLLMException(
f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
raise openllm.exceptions.OpenLLMException(f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs)
safe_serialization = openllm.utils.first_not_none(t.cast(
t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
attrs.pop('safe_serialization', None),
default=llm._serialisation_format == 'safetensors')
if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
)
"GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,
*decls,
quantize_config=t.cast('autogptq.BaseQuantizeConfig',
llm.quantization_config),
quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
trust_remote_code=llm.trust_remote_code,
use_safetensors=safe_serialization,
**hub_attrs,

View File

@@ -24,13 +24,11 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import T
else:
transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(),
'transformers'), openllm_core.utils.LazyLoader(
'torch', globals(), 'torch')
'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
_object_setattr = object.__setattr__
def process_config(model_id: str, trust_remote_code: bool,
**attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
'''A helper function that correctly parses config and attributes for transformers.PretrainedConfig.
Args:
@@ -55,8 +53,7 @@ def process_config(model_id: str, trust_remote_code: bool,
return config, hub_attrs, attrs
def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T:
__cls = getattr(transformers,
openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None)
__cls = getattr(transformers, openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None)
if __cls is None:
raise ValueError(f'Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`')
return __cls
@@ -105,13 +102,11 @@ def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
infer_fn: tuple[str, ...] = ('__call__',)
default_config = ModelSignature(batchable=False)
if llm.__llm_backend__ in {'pt', 'vllm'}:
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
'group_beam_search', 'constrained_beam_search',
)
infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search',
'constrained_beam_search',
)
elif llm.__llm_backend__ == 'tf':
infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search',
'contrastive_search',
)
infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search',)
else:
infer_fn += ('generate',)
return {k: default_config for k in infer_fn}

View File

@@ -27,10 +27,7 @@ def build_bento(model: str,
bentoml.bentos.delete(bento.tag)
@contextlib.contextmanager
def build_container(bento: bentoml.Bento | str | bentoml.Tag,
image_tag: str | None = None,
cleanup: bool = False,
**attrs: t.Any) -> t.Iterator[str]:
def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any) -> t.Iterator[str]:
if isinstance(bento, bentoml.Bento): bento_tag = bento.tag
else: bento_tag = bentoml.Tag.from_taglike(bento)
if image_tag is None: image_tag = str(bento_tag)
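The build_container context manager above yields the image tag as a string, defaulting it to the bento tag when image_tag is not given. A hedged usage sketch follows; the import location and the bento tag are assumptions, not taken from this diff.
# Usage sketch for build_container; the import path and bento tag are assumed, adjust to where the helper lives in this codebase.
from openllm.testing import build_container  # assumed location of the helper shown above

with build_container("opt-service:latest", cleanup=True) as image_tag:
    print("built container image:", image_tag)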

View File

@@ -27,8 +27,7 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
'serialisation_format': llm._serialisation_format
}
def infer_auto_class(
backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
def infer_auto_class(backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
import openllm
if backend == 'tf': return openllm.AutoTFLLM
elif backend == 'flax': return openllm.AutoFlaxLLM
@@ -36,10 +35,7 @@ def infer_auto_class(
elif backend == 'vllm': return openllm.AutoVLLM
else: raise RuntimeError(f"Unknown backend: {backend} (supported: 'pt', 'flax', 'tf', 'vllm')")
__all__ = [
'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects',
'dummy_vllm_objects'
]
__all__ = ['generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects', 'dummy_vllm_objects']
def __dir__() -> t.Sequence[str]:
return sorted(__all__)

View File

@@ -16,39 +16,26 @@ env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_na
def model_settings(draw: st.DrawFn):
'''Strategy for generating ModelSettings objects.'''
kwargs: dict[str, t.Any] = {
'default_id':
st.text(min_size=1),
'model_ids':
st.lists(st.text(), min_size=1),
'architecture':
st.text(min_size=1),
'url':
st.text(),
'requires_gpu':
st.booleans(),
'trust_remote_code':
st.booleans(),
'requirements':
st.none() | st.lists(st.text(), min_size=1),
'default_backend':
st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
'model_type':
st.sampled_from(['causal_lm', 'seq2seq_lm']),
'name_type':
st.sampled_from(['dasherize', 'lowercase']),
'timeout':
st.integers(min_value=3600),
'workers_per_resource':
st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
'default_id': st.text(min_size=1),
'model_ids': st.lists(st.text(), min_size=1),
'architecture': st.text(min_size=1),
'url': st.text(),
'requires_gpu': st.booleans(),
'trust_remote_code': st.booleans(),
'requirements': st.none() | st.lists(st.text(), min_size=1),
'default_backend': st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
'model_type': st.sampled_from(['causal_lm', 'seq2seq_lm']),
'name_type': st.sampled_from(['dasherize', 'lowercase']),
'timeout': st.integers(min_value=3600),
'workers_per_resource': st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
}
return draw(st.builds(ModelSettings, **kwargs))
def make_llm_config(
cls_name: str,
dunder_config: dict[str, t.Any] | ModelSettings,
fields: tuple[tuple[t.LiteralString, str, t.Any], ...] | None = None,
generation_fields: tuple[tuple[t.LiteralString, t.Any], ...] | None = None,
) -> type[openllm.LLMConfig]:
def make_llm_config(cls_name: str,
dunder_config: dict[str, t.Any] | ModelSettings,
fields: tuple[tuple[t.LiteralString, str, t.Any], ...] | None = None,
generation_fields: tuple[tuple[t.LiteralString, t.Any], ...] | None = None,
) -> type[openllm.LLMConfig]:
globs: dict[str, t.Any] = {'openllm': openllm}
_config_args: list[str] = []
lines: list[str] = [f'class {cls_name}Config(openllm.LLMConfig):']

View File

@@ -24,21 +24,19 @@ from ._strategies._configuration import make_llm_config
from ._strategies._configuration import model_settings
# XXX: @aarnphm fixes TypedDict behaviour in 3.11
@pytest.mark.skipif(sys.version_info[:2] == (3, 11),
reason='TypedDict in 3.11 behaves differently, so we need to fix this')
@pytest.mark.skipif(sys.version_info[:2] == (3, 11), reason='TypedDict in 3.11 behaves differently, so we need to fix this')
def test_missing_default():
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config('MissingDefaultId', {'name_type': 'lowercase', 'requirements': ['bentoml']})
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']})
with pytest.raises(ValueError, match='Missing required fields *'):
make_llm_config(
'MissingArchitecture', {
'default_id': 'huggingface/t5-tiny-testing',
'model_ids': ['huggingface/t5-tiny-testing'],
'requirements': ['bentoml'],
},
)
make_llm_config('MissingArchitecture', {
'default_id': 'huggingface/t5-tiny-testing',
'model_ids': ['huggingface/t5-tiny-testing'],
'requirements': ['bentoml'],
},
)
def test_forbidden_access():
cl_ = make_llm_config(
@@ -79,16 +77,11 @@ def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings):
cl_ = make_llm_config('AttrsProtocolLLM', gen_settings)
assert attr.has(cl_)
@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),
st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),
)
def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int,
input_temperature: float):
cl_ = make_llm_config('ComplexLLM',
gen_settings,
fields=(('field1', 'float', field1),),
generation_fields=(('temperature', temperature),),
)
@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473),
st.floats(min_value=0.0, max_value=1.0),
)
def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float):
cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),)
sent = cl_()
assert sent.model_dump()['field1'] == field1
assert sent.model_dump()['generation_config']['temperature'] == temperature
@@ -129,7 +122,6 @@ def test_struct_envvar():
assert overwrite_default['temperature'] == 0.2
def test_struct_provided_fields():
class EnvLLM(openllm.LLMConfig):
__config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
field1: int = 2
@@ -151,7 +143,7 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat
'architecture': 'PreTrainedModel'
},
fields=(('field1', 'float', 3.0),),
).model_construct_env(field1=20.0, temperature=0.4)
).model_construct_env(field1=20.0, temperature=0.4)
assert sent.generation_config.temperature == 0.4
assert sent.field1 == 20.0

View File

@@ -10,35 +10,22 @@ import openllm
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralBackend
_MODELING_MAPPING = {
'flan_t5': 'google/flan-t5-small',
'opt': 'facebook/opt-125m',
'baichuan': 'baichuan-inc/Baichuan-7B',
}
_PROMPT_MAPPING = {
'qa':
'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',
}
_MODELING_MAPPING = {'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B',}
_PROMPT_MAPPING = {'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',}
def parametrise_local_llm(
model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
backends: tuple[LiteralBackend, ...] = tuple()
if model in openllm.MODEL_MAPPING_NAMES: backends += ('pt',)
if model in openllm.MODEL_FLAX_MAPPING_NAMES: backends += ('flax',)
if model in openllm.MODEL_TF_MAPPING_NAMES: backends += ('tf',)
for backend, prompt in itertools.product(backends, _PROMPT_MAPPING.keys()):
yield prompt, openllm.Runner(model,
model_id=_MODELING_MAPPING[model],
ensure_available=True,
backend=backend,
init_local=True)
yield prompt, openllm.Runner(model, model_id=_MODELING_MAPPING[model], ensure_available=True, backend=backend, init_local=True)
def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
if os.getenv('GITHUB_ACTIONS') is None:
if 'prompt' in metafunc.fixturenames and 'llm' in metafunc.fixturenames:
metafunc.parametrize('prompt,llm',
[(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])
metafunc.parametrize('prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])
def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
# If no tests are collected, pytest exits with code 5, which makes the CI fail.

View File

@@ -40,13 +40,7 @@ if t.TYPE_CHECKING:
from openllm.client import BaseAsyncClient
class ResponseComparator(JSONSnapshotExtension):
def serialize(self,
data: SerializableData,
*,
exclude: PropertyFilter | None = None,
matcher: PropertyMatcher | None = None,
) -> SerializedData:
def serialize(self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None,) -> SerializedData:
if openllm.utils.LazyType(ListAny).isinstance(data):
data = [d.unmarshaled for d in data]
else:
@@ -55,7 +49,6 @@ class ResponseComparator(JSONSnapshotExtension):
return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode()
def matches(self, *, serialized_data: SerializableData, snapshot_data: SerializableData) -> bool:
def convert_data(data: SerializableData) -> openllm.GenerationOutput | t.Sequence[openllm.GenerationOutput]:
try:
data = orjson.loads(data)
@@ -83,8 +76,7 @@ class ResponseComparator(JSONSnapshotExtension):
return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and
eq_config(s.marshaled_config, t.marshaled_config))
return len(serialized_data) == len(snapshot_data) and all(
[eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])
return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])
@pytest.fixture()
def response_snapshot(snapshot: SnapshotAssertion):
@@ -133,14 +125,8 @@ class LocalHandle(_Handle):
return self.process.poll() is None
class HandleProtocol(t.Protocol):
@contextlib.contextmanager
def __call__(*,
model: str,
model_id: str,
image_tag: str,
quantize: t.AnyStr | None = None,
) -> t.Generator[_Handle, None, None]:
def __call__(*, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None,) -> t.Generator[_Handle, None, None]:
...
@attr.define(init=False)
@@ -148,9 +134,7 @@ class DockerHandle(_Handle):
container_name: str
docker_client: docker.DockerClient
def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int,
deployment_mode: t.Literal['container', 'local'],
):
def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int, deployment_mode: t.Literal['container', 'local'],):
self.__attrs_init__(port, deployment_mode, container_name, docker_client)
def status(self) -> bool:
@@ -165,22 +149,14 @@ def _local_handle(model: str,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
*,
_serve_grpc: bool = False,
):
):
with openllm.utils.reserve_free_port() as port:
pass
if not _serve_grpc:
proc = openllm.start(model,
model_id=model_id,
quantize=quantize,
additional_args=['--port', str(port)],
__test__=True)
proc = openllm.start(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True)
else:
proc = openllm.start_grpc(model,
model_id=model_id,
quantize=quantize,
additional_args=['--port', str(port)],
__test__=True)
proc = openllm.start_grpc(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True)
yield LocalHandle(proc, port, deployment_mode)
proc.terminate()
@@ -201,7 +177,7 @@ def _container_handle(model: str,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
*,
_serve_grpc: bool = False,
):
):
envvar = openllm.utils.EnvVarMixin(model)
with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port:
@@ -237,7 +213,7 @@ def _container_handle(model: str,
'3000/tcp': port,
'3001/tcp': prom_port
},
)
)
yield DockerHandle(client, container.name, port, deployment_mode)
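For context, a container handle like DockerHandle above is backed by the docker SDK; a minimal sketch of starting such a container, with a placeholder image tag and host ports, is shown below.
# Minimal sketch of starting a container the way the fixture above does; image tag and host ports are placeholders.
import docker

client = docker.from_env()
container = client.containers.run("openllm-opt:latest", detach=True, ports={"3000/tcp": 3000, "3001/tcp": 3001})
print(container.name)  # the container name is what DockerHandle stores as container_name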

View File

@@ -16,11 +16,8 @@ model = 'flan_t5'
model_id = 'google/flan-t5-small'
@pytest.fixture(scope='module')
def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'],
clean_context: contextlib.ExitStack,
):
with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode,
clean_context=clean_context) as image_tag:
def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
yield handle

View File

@@ -16,11 +16,8 @@ model = 'opt'
model_id = 'facebook/opt-125m'
@pytest.fixture(scope='module')
def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'],
clean_context: contextlib.ExitStack,
):
with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode,
clean_context=clean_context) as image_tag:
def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
yield handle

View File

@@ -15,11 +15,10 @@ if t.TYPE_CHECKING:
HF_INTERNAL_T5_TESTING = 'hf-internal-testing/tiny-random-t5'
actions_xfail = functools.partial(
pytest.mark.xfail,
condition=os.getenv('GITHUB_ACTIONS') is not None,
reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.',
)
actions_xfail = functools.partial(pytest.mark.xfail,
condition=os.getenv('GITHUB_ACTIONS') is not None,
reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.',
)
@actions_xfail
def test_general_build_with_internal_testing():
@@ -51,8 +50,7 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
def dockerfile_template(tmp_path_factory: pytest.TempPathFactory):
file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template'
file.write_text(
"{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}"
)
"{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}")
return file
@pytest.mark.usefixtures('dockerfile_template')

View File

@@ -71,11 +71,9 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
mcls.setenv('CUDA_VISIBLE_DEVICES', '')
assert len(NvidiaGpuResource.from_system()) >= 0 # TODO: real from_system tests
assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],
).match('Input list should be all string type.')
assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],).match('Input list should be all string type.')
assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match('Input list should be all string type.')
assert pytest.raises(ValueError, NvidiaGpuResource.validate,
['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')
assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')
def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as mcls: