chore: ignore new lines split [skip ci]

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Author: aarnphm-ec2-dev
Date: 2023-09-01 17:00:49 +00:00
parent 608de0b667
commit 7d893e6cd2
70 changed files with 575 additions and 950 deletions


@@ -43,13 +43,10 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
try:
tokenizer = cloudpickle.load(t.cast('t.IO[bytes]', cofile))['tokenizer']
except KeyError:
- raise openllm.exceptions.OpenLLMException(
- "Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
- "For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
+ raise openllm.exceptions.OpenLLMException("Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
+ "For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
else:
- tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'),
- trust_remote_code=llm.trust_remote_code,
- **tokenizer_attrs)
+ tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs)
if tokenizer.pad_token_id is None:
if config.pad_token_id is not None: tokenizer.pad_token_id = config.pad_token_id
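The error message above spells out the fix on the save side: the tokenizer must be stored alongside the model under `custom_objects` so that `load_tokenizer` can unpickle it later. A minimal sketch of that save call (model id and Bento model name are purely illustrative, not from this commit):

import bentoml
import transformers

model_id = 'facebook/opt-125m'  # illustrative model id
model = transformers.AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Storing the tokenizer under custom_objects is what load_tokenizer expects to find.
bentoml.transformers.save_model('opt-125m-demo', model, custom_objects={'tokenizer': tokenizer})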


@@ -6,6 +6,4 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = {
'flax': ('FlaxAutoModelForCausalLM', 'FlaxAutoModelForSeq2SeqLM'),
'vllm': ('AutoModelForCausalLM', 'AutoModelForSeq2SeqLM')
}
- HUB_ATTRS = [
- 'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token'
- ]
+ HUB_ATTRS = ['cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token']
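HUB_ATTRS is an allow-list of Hugging Face Hub download arguments. A hedged sketch of how such a list is typically consumed (the helper name below is hypothetical, not part of this diff): hub-specific keys are popped out of the caller's kwargs so that only model-level kwargs reach `from_pretrained`.

HUB_ATTRS = ['cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token']

def split_hub_attrs(**attrs):
    # Pop recognised hub arguments; everything else stays for the model call.
    hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs}
    return hub_attrs, attrs

hub_attrs, attrs = split_hub_attrs(revision='main', torch_dtype='auto')
# hub_attrs -> {'revision': 'main'}; attrs -> {'torch_dtype': 'auto'}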


@@ -43,11 +43,7 @@ logger = logging.getLogger(__name__)
__all__ = ['import_model', 'get', 'load_model']
@inject
- def import_model(llm: openllm.LLM[M, T],
- *decls: t.Any,
- trust_remote_code: bool,
- _model_store: ModelStore = Provide[BentoMLContainer.model_store],
- **attrs: t.Any) -> bentoml.Model:
+ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, _model_store: ModelStore = Provide[BentoMLContainer.model_store], **attrs: t.Any) -> bentoml.Model:
"""Auto detect model type from given model_id and import it to bentoml's model store.
For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first,
@@ -76,8 +72,7 @@ def import_model(llm: openllm.LLM[M, T],
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
- raise openllm.exceptions.OpenLLMException(
- "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
+ raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
signatures['generate'] = {'batchable': False}
@@ -107,8 +102,7 @@ def import_model(llm: openllm.LLM[M, T],
tokenizer.save_pretrained(bentomodel.path)
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
- raise openllm.exceptions.OpenLLMException(
- "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
+ raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
@@ -124,20 +118,13 @@ def import_model(llm: openllm.LLM[M, T],
else:
architectures = getattr(config, 'architectures', [])
if not architectures:
- raise RuntimeError(
- 'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
- )
+ raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
architecture = architectures[0]
update_model(bentomodel, metadata={'_pretrained_class': architecture})
if llm._local:
# possible local path
logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
- model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id,
- *decls,
- config=config,
- trust_remote_code=trust_remote_code,
- **hub_attrs,
- **attrs)
+ model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
# for trust_remote_code to work
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
@@ -149,8 +136,7 @@ def import_model(llm: openllm.LLM[M, T],
else:
bentomodel.flush() # type: ignore[no-untyped-call]
bentomodel.save(_model_store)
- openllm.utils.analytics.track(
- openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
+ openllm.utils.analytics.track(openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
finally:
bentomodel.exit_cloudpickle_context(imported_modules)
# NOTE: We need to free up the cache after importing the model
@@ -171,8 +157,7 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
if Version(model.info.api_version) < Version('v2'):
raise openllm.exceptions.OpenLLMException('Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
if model.info.labels['backend'] != llm.__llm_backend__:
- raise openllm.exceptions.OpenLLMException(
- f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.")
+ raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.")
return model
except Exception as err:
if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
@@ -185,8 +170,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
default=llm._serialisation_format == 'safetensors')
if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
if not openllm.utils.is_autogptq_available():
- raise openllm.exceptions.OpenLLMException(
- "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
+ raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,
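The `is_autogptq_available()` guard repeated throughout this file presumably boils down to an import probe. A sketch of that pattern (an assumption about openllm.utils, not its actual implementation):

import importlib.util

def is_autogptq_available() -> bool:
    # Assumed check: is the 'auto_gptq' package importable in this environment?
    return importlib.util.find_spec('auto_gptq') is not None

if not is_autogptq_available():
    raise RuntimeError('Install GPTQ support with: pip install "openllm[gptq]"')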


@@ -23,8 +23,7 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import M
from openllm_core._typing_compat import T
else:
- transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(),
- 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
+ transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
_object_setattr = object.__setattr__
@@ -45,11 +44,7 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu
if not isinstance(config, transformers.PretrainedConfig):
copied_attrs = copy.deepcopy(attrs)
if copied_attrs.get('torch_dtype', None) == 'auto': copied_attrs.pop('torch_dtype')
- config, attrs = transformers.AutoConfig.from_pretrained(model_id,
- return_unused_kwargs=True,
- trust_remote_code=trust_remote_code,
- **hub_attrs,
- **copied_attrs)
+ config, attrs = transformers.AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs)
return config, hub_attrs, attrs
def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T:
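`return_unused_kwargs=True` is what lets `process_config` hand back the kwargs that `AutoConfig` did not consume, so they can still be forwarded to the model's `from_pretrained` later. A small illustration of that contract (the model id is illustrative):

import transformers

config, unused = transformers.AutoConfig.from_pretrained(
    'gpt2',                    # illustrative model id
    return_unused_kwargs=True,
    output_attentions=True,    # a config attribute: consumed by the config
    low_cpu_mem_usage=True,    # a model-loading kwarg: handed back unchanged
)
# unused -> {'low_cpu_mem_usage': True}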
@@ -62,9 +57,7 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra
if llm.config['trust_remote_code']:
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if not hasattr(config, 'auto_map'):
- raise ValueError(
- f'Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping'
- )
+ raise ValueError(f'Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping')
# in case this model doesn't use the correct auto class for model type, for example like chatglm
# where it uses AutoModel instead of AutoModelForCausalLM. Then we fallback to AutoModel
if autoclass not in config.auto_map: autoclass = 'AutoModel'
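For trust_remote_code models, `config.auto_map` maps auto-class names to the classes shipped in the model repo; models such as ChatGLM only register `AutoModel`, which is why the code above falls back to it. A sketch with illustrative values:

auto_map = {
    'AutoConfig': 'configuration_chatglm.ChatGLMConfig',               # illustrative entries
    'AutoModel': 'modeling_chatglm.ChatGLMForConditionalGeneration',
}
autoclass = 'AutoModelForCausalLM'
if autoclass not in auto_map:
    autoclass = 'AutoModel'  # same fallback as the diff above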
@@ -84,7 +77,8 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod
based: DictStrAny = copy.deepcopy(bentomodel.info.metadata)
based.update(metadata)
_object_setattr(
- bentomodel, '_info',
+ bentomodel,
+ '_info',
ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged
tag=bentomodel.info.tag,
module=bentomodel.info.module,
@@ -102,9 +96,7 @@ def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
infer_fn: tuple[str, ...] = ('__call__',)
default_config = ModelSignature(batchable=False)
if llm.__llm_backend__ in {'pt', 'vllm'}:
- infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search',
- 'constrained_beam_search',
- )
+ infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search',)
elif llm.__llm_backend__ == 'tf':
infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search',)
else: