perf: improve build logic and cleanup speed (#657)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-11-15 00:18:31 -05:00
committed by GitHub
parent 103156cd71
commit a58d947bc8
11 changed files with 141 additions and 237 deletions

View File

@@ -4,8 +4,8 @@ import importlib
import cloudpickle
import fs
import openllm
from openllm_core._typing_compat import ParamSpec
from openllm_core.exceptions import OpenLLMException
P = ParamSpec('P')
@@ -31,7 +31,7 @@ def load_tokenizer(llm, **tokenizer_attrs):
try:
tokenizer = cloudpickle.load(cofile)['tokenizer']
except KeyError:
raise openllm.exceptions.OpenLLMException(
raise OpenLLMException(
"Bento model does not have tokenizer. Make sure to save the tokenizer within the model via 'custom_objects'. "
'For example: "bentoml.transformers.save_model(..., custom_objects={\'tokenizer\': tokenizer})"'
) from None

View File

@@ -13,7 +13,7 @@ import bentoml
import openllm
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelOptions, ModelSignature
from openllm_core._typing_compat import M, T
from openllm_core.exceptions import OpenLLMException
from ._helpers import get_hash, infer_autoclass_from_llm, process_config
from .weights import HfIgnore
@@ -24,7 +24,7 @@ __all__ = ['import_model', 'get', 'load_model']
_object_setattr = object.__setattr__
def _patch_correct_tag(llm: openllm.LLM[M, T], config: transformers.PretrainedConfig, _revision: str | None = None):
def _patch_correct_tag(llm, config, _revision=None):
# NOTE: The following won't hit during local since we generated a correct version based on local path hash It will only hit if we use model from HF Hub
if llm.revision is not None:
return
@@ -76,7 +76,7 @@ def import_model(llm, *decls, trust_remote_code, _model_store=Provide[BentoMLCon
if quantize == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
raise openllm.exceptions.OpenLLMException(
raise OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)
signatures['generate'] = {'batchable': False}
@@ -175,7 +175,7 @@ def get(llm):
model = bentoml.models.get(llm.tag)
backend = model.info.labels['backend']
if backend != llm.__llm_backend__:
raise openllm.exceptions.OpenLLMException(
raise OpenLLMException(
f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'."
)
_patch_correct_tag(
@@ -185,9 +185,7 @@ def get(llm):
)
return model
except Exception as err:
raise openllm.exceptions.OpenLLMException(
f'Failed while getting stored artefact (lookup for traceback):\n{err}'
) from err
raise OpenLLMException(f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
def check_unintialised_params(model):
@@ -216,13 +214,11 @@ def load_model(llm, *decls, **attrs):
_quantise = llm.bentomodel.info.metadata['_quantize']
if _quantise == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
raise openllm.exceptions.OpenLLMException(
raise OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(
f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})"
)
raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
# TODO: investigate load with flash attention
model = auto_class.from_pretrained(

View File

@@ -1,17 +1,11 @@
from __future__ import annotations
import copy
import typing as t
import transformers
from openllm.serialisation.constants import FRAMEWORK_TO_AUTOCLASS_MAPPING, HUB_ATTRS
from openllm_core.exceptions import OpenLLMException
if t.TYPE_CHECKING:
from openllm_core._typing_compat import M, T
from ..._llm import LLM
def get_hash(config) -> str:
_commit_hash = getattr(config, '_commit_hash', None)
@@ -34,7 +28,7 @@ def process_config(model_id, trust_remote_code, **attrs):
return config, hub_attrs, attrs
def infer_autoclass_from_llm(llm: LLM[M, T], config, /):
def infer_autoclass_from_llm(llm, config, /):
if llm.trust_remote_code:
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if not hasattr(config, 'auto_map'):

View File

@@ -13,7 +13,6 @@ if t.TYPE_CHECKING:
from huggingface_hub.hf_api import ModelInfo as HfModelInfo
import openllm
from openllm_core._typing_compat import M, T
__global_inst__ = None
__cached_id__: dict[str, HfModelInfo] = dict()
@@ -39,7 +38,7 @@ def ModelInfo(model_id: str, revision: str | None = None) -> HfModelInfo:
def has_safetensors_weights(model_id: str, revision: str | None = None) -> bool:
if validate_is_path(model_id):
return next((True for item in Path(resolve_filepath(model_id)).glob('*.safetensors')), False)
return next((True for _ in Path(resolve_filepath(model_id)).glob('*.safetensors')), False)
return any(s.rfilename.endswith('.safetensors') for s in ModelInfo(model_id, revision=revision).siblings)
@@ -52,7 +51,7 @@ class HfIgnore:
gguf = '*.gguf'
@classmethod
def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
def ignore_patterns(cls, llm: openllm.LLM[t.Any, t.Any]) -> list[str]:
if llm.__llm_backend__ in {'vllm', 'pt'}:
base = [cls.tf, cls.flax, cls.gguf]
if has_safetensors_weights(llm.model_id):