feat(ctranslate): initial infrastructure support (#694)

* perf: compact and improve speed and agility

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* --wip--

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: cleanup infrastructure

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update styles notes and autogen mypy configuration

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Author: Aaron Pham
Date: 2023-11-19 01:48:33 -05:00
Committed by: GitHub
Parent: 93ffb29e9f
Commit: 206521e02d
38 changed files with 507 additions and 642 deletions

View File

@@ -42,6 +42,7 @@ from openllm_core.utils import (
generate_hash_from_file,
get_disable_warnings,
get_quiet_mode,
getenv,
is_peft_available,
is_vllm_available,
resolve_filepath,
@@ -52,6 +53,7 @@ from .exceptions import ForbiddenAttributeError, OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME
if t.TYPE_CHECKING:
import torch
import transformers
from peft.config import PeftConfig
@@ -109,8 +111,8 @@ def _torch_dtype_mapping():
return {
'half': torch.float16,
'float16': torch.float16,
'float': torch.float32,
'float16': torch.float16,
'float32': torch.float32,
'bfloat16': torch.bfloat16,
}
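
As a quick sanity check on the mapping above, aliases and canonical names resolve to the same singleton torch dtype (a minimal sketch, assuming torch is installed):

import torch

_mapping = {
  'half': torch.float16,
  'float16': torch.float16,
  'float': torch.float32,
  'float32': torch.float32,
  'bfloat16': torch.bfloat16,
}
assert _mapping['half'] is _mapping['float16']   # both aliases collapse to torch.float16
assert _mapping['float'] is _mapping['float32']  # and to torch.float32
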
@@ -132,7 +134,8 @@ class LLM(t.Generic[M, T], ReprMixin):
_prompt_template: PromptTemplate | None
_system_message: str | None
__llm_torch_dtype__: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto'
__llm_dtype__: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto'
__llm_torch_dtype__: 'torch.dtype' = None
__llm_config__: LLMConfig | None = None
__llm_backend__: LiteralBackend = None # type: ignore
__llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None
@@ -158,16 +161,23 @@ class LLM(t.Generic[M, T], ReprMixin):
serialisation='safetensors',
trust_remote_code=False,
embedded=False,
torch_dtype='auto',
dtype='auto',
low_cpu_mem_usage=True,
**attrs,
):
# backward compatible
torch_dtype = attrs.pop('torch_dtype', None)
if torch_dtype is not None:
logger.warning(
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.'
)
dtype = torch_dtype
_local = False
if validate_is_path(model_id):
model_id, _local = resolve_filepath(model_id), True
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt')
torch_dtype = first_not_none(os.getenv('TORCH_DTYPE'), torch_dtype, default='auto')
quantize = first_not_none(quantize, os.getenv('OPENLLM_QUANTIZE'), default=None)
dtype = first_not_none(getenv('dtype', default=dtype, var=['TORCH_DTYPE']), default='auto')
quantize = first_not_none(getenv('quantize', default=quantize, var=['QUANTISE']), default=None)
attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage})
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
model_attrs, tokenizer_attrs = flatten_attrs(**attrs)
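
The deprecation shim above can be read in isolation as follows. resolve_dtype is a hypothetical stand-in, and the env-var precedence is an assumption inferred from the getenv('dtype', default=dtype, var=['TORCH_DTYPE']) call site, not the actual helper:

import logging
import os

logger = logging.getLogger(__name__)

def resolve_dtype(dtype='auto', **attrs):
  # Accept the deprecated 'torch_dtype' kwarg, warn, and fold it into 'dtype'.
  torch_dtype = attrs.pop('torch_dtype', None)
  if torch_dtype is not None:
    logger.warning('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.')
    dtype = torch_dtype
  # Assumed precedence: OPENLLM_DTYPE, then the legacy TORCH_DTYPE, then the argument.
  return os.environ.get('OPENLLM_DTYPE', os.environ.get('TORCH_DTYPE', dtype))

print(resolve_dtype(torch_dtype='float16'))  # warns, prints 'float16'
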
@@ -189,7 +199,7 @@ class LLM(t.Generic[M, T], ReprMixin):
system_message=system_message,
LLM__model_attrs=model_attrs,
LLM__tokenizer_attrs=tokenizer_attrs,
llm_torch_dtype__=torch_dtype.lower(),
llm_dtype__=dtype.lower(),
llm_backend__=backend,
llm_config__=llm_config,
llm_trust_remote_code__=trust_remote_code,
@@ -222,15 +232,15 @@ class LLM(t.Generic[M, T], ReprMixin):
config_dtype = getattr(hf_config, 'torch_dtype', None)
if config_dtype is None:
config_dtype = torch.float32
if self.__llm_torch_dtype__ == 'auto':
if self.__llm_dtype__ == 'auto':
if config_dtype == torch.float32:
torch_dtype = torch.float16 # following common practice
else:
torch_dtype = config_dtype
else:
if self.__llm_torch_dtype__ not in _torch_dtype_mapping():
raise ValueError(f"Unknown dtype '{self.__llm_torch_dtype__}'")
torch_dtype = _torch_dtype_mapping()[self.__llm_torch_dtype__]
if self.__llm_dtype__ not in _torch_dtype_mapping():
raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
torch_dtype = _torch_dtype_mapping()[self.__llm_dtype__]
self.__llm_torch_dtype__ = torch_dtype
return self.__llm_torch_dtype__
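
Restated as a standalone function, the resolution above behaves like this sketch (name and signature assumed for illustration):

import torch

def _resolve_torch_dtype(requested: str, config_dtype) -> torch.dtype:
  mapping = {'half': torch.float16, 'float16': torch.float16, 'float': torch.float32, 'float32': torch.float32, 'bfloat16': torch.bfloat16}
  if config_dtype is None:
    config_dtype = torch.float32  # HF configs without torch_dtype default to float32
  if requested == 'auto':
    # float32 checkpoints are downcast to float16, following common practice
    return torch.float16 if config_dtype == torch.float32 else config_dtype
  if requested not in mapping:
    raise ValueError(f"Unknown dtype '{requested}'")
  return mapping[requested]

assert _resolve_torch_dtype('auto', None) is torch.float16
assert _resolve_torch_dtype('bfloat16', torch.float32) is torch.bfloat16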

View File

@@ -32,7 +32,8 @@ class IdentifyingParams(TypedDict):
model_id: str
ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]]
Dtype = Union[LiteralDtype, Literal['auto', 'half', 'float']]
CTranslateDtype = Literal['int8_float32', 'int8_float16', 'int8_bfloat16']
Dtype = Union[LiteralDtype, CTranslateDtype, Literal['auto', 'half', 'float']]
@attr.define(slots=True, repr=False, init=False)
class LLM(Generic[M, T]):
@@ -50,7 +51,8 @@ class LLM(Generic[M, T]):
_prompt_template: Optional[PromptTemplate]
_system_message: Optional[str]
__llm_torch_dtype__: Dtype = ...
__llm_dtype__: Dtype = ...
__llm_torch_dtype__: Optional[torch.dtype] = ...
__llm_config__: Optional[LLMConfig] = ...
__llm_backend__: LiteralBackend = ...
__llm_quantization_config__: Optional[QuantizationConfig] = ...
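
The widened union means a CTranslate2 compute type is now a valid dtype value. For illustration (the exact definition of LiteralDtype is assumed here):

from typing import Literal, Union

LiteralDtype = Literal['float16', 'float32', 'bfloat16']  # assumed definition
CTranslateDtype = Literal['int8_float32', 'int8_float16', 'int8_bfloat16']
Dtype = Union[LiteralDtype, CTranslateDtype, Literal['auto', 'half', 'float']]

dtype: Dtype = 'int8_float16'  # now accepted for the ctranslate backend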

View File

@@ -1,12 +1,7 @@
from __future__ import annotations
from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import (
is_autoawq_available,
is_autogptq_available,
is_bitsandbytes_available,
is_optimum_supports_gptq,
)
from openllm_core.utils import is_autoawq_available, is_autogptq_available, is_bitsandbytes_available
def infer_quantisation_config(llm, quantise, **attrs):
@@ -98,7 +93,7 @@ def infer_quantisation_config(llm, quantise, **attrs):
elif quantise == 'int4':
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available() or not is_optimum_supports_gptq():
if not is_autogptq_available():
raise MissingDependencyError(
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
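
After this change the GPTQ branch gates only on auto-gptq. Condensed as a sketch (the helper name _check_gptq is hypothetical; the imports are the ones used in this file):

from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import is_autogptq_available

def _check_gptq() -> None:
  # The optimum version gate is gone; only auto-gptq availability matters now.
  if not is_autogptq_available():
    raise MissingDependencyError("GPTQ requires 'auto-gptq' to be installed. Do it with 'pip install \"openllm[gptq]\"'")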

View File

@@ -1,18 +1,13 @@
"""Protocol-related packages for all library integrations.
Currently supports OpenAI-compatible APIs.
"""
from __future__ import annotations
import os
import typing as t
from openllm_core.utils import LazyModule
_import_structure: dict[str, list[str]] = {'openai': []}
_import_structure: dict[str, list[str]] = {'openai': [], 'cohere': [], 'hf': []}
if t.TYPE_CHECKING:
from . import openai as openai
from . import cohere as cohere, hf as hf, openai as openai
__lazy = LazyModule(__name__, os.path.abspath(__file__), _import_structure)
__all__ = __lazy.__all__
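
LazyModule defers the actual submodule imports until first attribute access. A minimal sketch of the pattern (not the openllm_core implementation):

import importlib
import types

class _LazySketch(types.ModuleType):
  def __init__(self, name, import_structure):
    super().__init__(name)
    self._submodules = set(import_structure)
    self.__all__ = sorted(self._submodules)

  def __getattr__(self, item):
    if item in self._submodules:
      # Import the 'cohere'/'hf'/'openai' submodule only when first accessed.
      return importlib.import_module(f'.{item}', self.__name__)
    raise AttributeError(f'module {self.__name__} has no attribute {item}')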

View File

@@ -1,21 +1,26 @@
from __future__ import annotations
import importlib
import typing as t
import cloudpickle
import fs
from openllm_core._typing_compat import ParamSpec
from openllm_core._typing_compat import M, ParamSpec, T, TypeGuard
from openllm_core.exceptions import OpenLLMException
if t.TYPE_CHECKING:
from bentoml import Model
from .._llm import LLM
P = ParamSpec('P')
def load_tokenizer(llm, **tokenizer_attrs):
def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
"""Load the tokenizer from BentoML store.
By default, it will try to find the bentomodel whether it is in store..
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
"""
import cloudpickle
import fs
from transformers import AutoTokenizer
tokenizer_attrs = {**llm.llm_parameters[-1], **tokenizer_attrs}
@@ -52,34 +57,39 @@ def load_tokenizer(llm, **tokenizer_attrs):
return tokenizer
_extras = ['get', 'import_model', 'load_model']
def _make_dispatch_function(fn):
def caller(llm, *args, **kwargs):
def caller(llm: LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> TypeGuard[M | T | Model]:
"""Generic function dispatch to correct serialisation submodules based on LLM runtime.
> [!NOTE] See 'openllm.serialisation.transformers' if 'llm.__llm_backend__ in ("pt", "vllm")'
> [!NOTE] See 'openllm.serialisation.ggml' if 'llm.__llm_backend__="ggml"'
> [!NOTE] See 'openllm.serialisation.ctranslate' if 'llm.__llm_backend__="ctranslate"'
"""
serde = 'transformers'
if llm.__llm_backend__ == 'ggml':
serde = 'ggml'
return getattr(importlib.import_module(f'.{serde}', __name__), fn)(llm, *args, **kwargs)
elif llm.__llm_backend__ == 'ctranslate':
serde = 'ctranslate'
elif llm.__llm_backend__ in {'pt', 'vllm'}:
serde = 'transformers'
else:
raise OpenLLMException(f'Not supported backend {llm.__llm_backend__}')
return getattr(importlib.import_module(f'.{serde}', 'openllm.serialisation'), fn)(llm, *args, **kwargs)
return caller
_import_structure: dict[str, list[str]] = {'ggml': [], 'transformers': [], 'constants': []}
__all__ = ['ggml', 'transformers', 'constants', 'load_tokenizer', *_extras]
_extras = ['get', 'import_model', 'load_model']
_import_structure = {'ggml', 'transformers', 'ctranslate', 'constants'}
__all__ = ['load_tokenizer', *_extras, *_import_structure]
def __dir__():
def __dir__() -> t.Sequence[str]:
return sorted(__all__)
def __getattr__(name):
def __getattr__(name: str) -> t.Any:
if name == 'load_tokenizer':
return load_tokenizer
elif name in _import_structure:
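
Condensed, the dispatch above is a routing table from backend to serialisation submodule (a sketch; the dispatch function name is hypothetical):

import importlib

from openllm_core.exceptions import OpenLLMException

_SERDE_BY_BACKEND = {'ggml': 'ggml', 'ctranslate': 'ctranslate', 'pt': 'transformers', 'vllm': 'transformers'}

def dispatch(fn, llm, *args, **kwargs):
  # Equivalent to the if/elif chain in caller() above.
  serde = _SERDE_BY_BACKEND.get(llm.__llm_backend__)
  if serde is None:
    raise OpenLLMException(f'Not supported backend {llm.__llm_backend__}')
  return getattr(importlib.import_module(f'.{serde}', 'openllm.serialisation'), fn)(llm, *args, **kwargs)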

View File

@@ -63,9 +63,22 @@ def import_model(llm, *decls, trust_remote_code, _model_store=Provide[BentoMLCon
metadata['_quantize'] = quantize
architectures = getattr(config, 'architectures', [])
if not architectures:
raise RuntimeError(
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
)
if trust_remote_code:
auto_map = getattr(config, 'auto_map', {})
if not auto_map:
raise RuntimeError(
f'Failed to determine the architecture from either `auto_map` or `architectures` for {llm.model_id}'
)
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if autoclass not in auto_map:
raise RuntimeError(
f"Given model '{llm.model_id}' is yet to be supported with 'auto_map'. OpenLLM currently only support encoder-decoders or decoders only models."
)
architectures = [auto_map[autoclass]]
else:
raise RuntimeError(
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
)
metadata['_pretrained_class'] = architectures[0]
if not llm._local:
metadata['_revision'] = get_hash(config)
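
The new trust_remote_code branch, condensed into one hypothetical helper (auto_map is the config field remote-code models use to point at their custom classes):

def _resolve_architecture(config, model_type, trust_remote_code):
  # Prefer config.architectures; for remote-code models fall back to the
  # auto_map entry for the matching Auto class.
  architectures = getattr(config, 'architectures', []) or []
  if architectures:
    return architectures[0]
  if not trust_remote_code:
    raise RuntimeError('Failed to determine the architecture for this model.')
  auto_map = getattr(config, 'auto_map', {}) or {}
  autoclass = 'AutoModelForSeq2SeqLM' if model_type == 'seq2seq_lm' else 'AutoModelForCausalLM'
  if autoclass not in auto_map:
    raise RuntimeError(f"'{autoclass}' not found in auto_map; only encoder-decoder or decoder-only models are supported.")
  return auto_map[autoclass]
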
@@ -75,7 +88,7 @@ def import_model(llm, *decls, trust_remote_code, _model_store=Provide[BentoMLCon
signatures = {}
if quantize == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
if not openllm.utils.is_autogptq_available():
raise OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)
@@ -213,7 +226,7 @@ def load_model(llm, *decls, **attrs):
if '_quantize' in llm.bentomodel.info.metadata:
_quantise = llm.bentomodel.info.metadata['_quantize']
if _quantise == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
if not openllm.utils.is_autogptq_available():
raise OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import copy
import logging
import typing as t
import transformers
@@ -10,14 +11,14 @@ from openllm_core.utils import get_disable_warnings, get_quiet_mode
logger = logging.getLogger(__name__)
def get_hash(config) -> str:
def get_hash(config: transformers.PretrainedConfig) -> str:
_commit_hash = getattr(config, '_commit_hash', None)
if _commit_hash is None:
raise ValueError(f'Cannot find commit hash in {config}')
return _commit_hash
def process_config(model_id, trust_remote_code, **attrs):
def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any):
config = attrs.pop('config', None)
# this logic below is synonymous to handling `from_pretrained` attrs.
hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs}
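
For context, the hub_attrs split separates from_pretrained-style hub options from config overrides. A sketch, with HUB_ATTRS assumed to list hub-only keys:

HUB_ATTRS = ['revision', 'cache_dir', 'force_download', 'proxies']  # assumed subset

def split_hub_attrs(**attrs):
  # Pull hub-only kwargs out so the remainder can be applied to the config.
  hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs}
  return hub_attrs, attrs

hub, rest = split_hub_attrs(revision='main', torch_dtype='float16')
print(hub)   # {'revision': 'main'}
print(rest)  # {'torch_dtype': 'float16'}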

View File

@@ -1,74 +1,10 @@
"""Utilities function for OpenLLM.
User can import these function for convenience, but
we won't ensure backward compatibility for these functions. So use with caution.
"""
from __future__ import annotations
import functools
import importlib.metadata
import typing as t
import openllm_core
if t.TYPE_CHECKING:
import openllm
from openllm_core.utils import (
DEBUG as DEBUG,
DEBUG_ENV_VAR as DEBUG_ENV_VAR,
DEV_DEBUG_VAR as DEV_DEBUG_VAR,
ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES,
MYPY as MYPY,
OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
QUIET_ENV_VAR as QUIET_ENV_VAR,
SHOW_CODEGEN as SHOW_CODEGEN,
LazyLoader as LazyLoader,
LazyModule as LazyModule,
ReprMixin as ReprMixin,
VersionInfo as VersionInfo,
analytics as analytics,
calc_dir_size as calc_dir_size,
check_bool_env as check_bool_env,
codegen as codegen,
configure_logging as configure_logging,
dantic as dantic,
field_env_key as field_env_key,
first_not_none as first_not_none,
flatten_attrs as flatten_attrs,
gen_random_uuid as gen_random_uuid,
generate_context as generate_context,
generate_hash_from_file as generate_hash_from_file,
get_debug_mode as get_debug_mode,
get_disable_warnings as get_disable_warnings,
get_quiet_mode as get_quiet_mode,
in_notebook as in_notebook,
is_autoawq_available as is_autoawq_available,
is_autogptq_available as is_autogptq_available,
is_bentoml_available as is_bentoml_available,
is_bitsandbytes_available as is_bitsandbytes_available,
is_grpc_available as is_grpc_available,
is_jupyter_available as is_jupyter_available,
is_jupytext_available as is_jupytext_available,
is_notebook_available as is_notebook_available,
is_optimum_supports_gptq as is_optimum_supports_gptq,
is_peft_available as is_peft_available,
is_torch_available as is_torch_available,
is_transformers_available as is_transformers_available,
is_vllm_available as is_vllm_available,
lenient_issubclass as lenient_issubclass,
reserve_free_port as reserve_free_port,
resolve_filepath as resolve_filepath,
resolve_user_filepath as resolve_user_filepath,
serde as serde,
set_debug_mode as set_debug_mode,
set_disable_warnings as set_disable_warnings,
set_quiet_mode as set_quiet_mode,
validate_is_path as validate_is_path,
)
from openllm_core.utils.serde import converter as converter
def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
def generate_labels(llm):
return {
'backend': llm.__llm_backend__,
'framework': 'openllm',
@@ -79,27 +15,26 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
}
def available_devices() -> tuple[str, ...]:
"""Return available GPU under system. Currently only supports NVIDIA GPUs."""
def available_devices():
from .._strategies import NvidiaGpuResource
return tuple(NvidiaGpuResource.from_system())
@functools.lru_cache(maxsize=1)
def device_count() -> int:
def device_count():
return len(available_devices())
__all__ = ['generate_labels', 'available_devices', 'device_count']
def __dir__() -> t.Sequence[str]:
return sorted(__all__) + sorted(dir(openllm_core.utils))
def __dir__():
coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')])
return sorted(__all__) + sorted(list(coreutils))
def __getattr__(it: str) -> t.Any:
def __getattr__(it):
if hasattr(openllm_core.utils, it):
return getattr(openllm_core.utils, it)
else:
raise AttributeError(f'module {__name__} has no attribute {it}')
raise AttributeError(f'module {__name__} has no attribute {it}')
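
In practice the PEP 562 __getattr__ hook above lets openllm.utils proxy everything from openllm_core.utils. A hypothetical usage:

import openllm.utils

openllm.utils.device_count()            # defined locally in this module
openllm.utils.first_not_none(None, 1)   # proxied through to openllm_core.utils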

View File

@@ -0,0 +1,61 @@
from typing import Any, Dict, Tuple
from openllm_core.utils import (
DEBUG as DEBUG,
DEBUG_ENV_VAR as DEBUG_ENV_VAR,
DEV_DEBUG_VAR as DEV_DEBUG_VAR,
ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES,
MYPY as MYPY,
OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
QUIET_ENV_VAR as QUIET_ENV_VAR,
SHOW_CODEGEN as SHOW_CODEGEN,
LazyLoader as LazyLoader,
LazyModule as LazyModule,
ReprMixin as ReprMixin,
VersionInfo as VersionInfo,
analytics as analytics,
calc_dir_size as calc_dir_size,
check_bool_env as check_bool_env,
codegen as codegen,
configure_logging as configure_logging,
dantic as dantic,
field_env_key as field_env_key,
first_not_none as first_not_none,
flatten_attrs as flatten_attrs,
gen_random_uuid as gen_random_uuid,
generate_context as generate_context,
generate_hash_from_file as generate_hash_from_file,
get_debug_mode as get_debug_mode,
get_disable_warnings as get_disable_warnings,
get_quiet_mode as get_quiet_mode,
getenv as getenv,
in_notebook as in_notebook,
is_autoawq_available as is_autoawq_available,
is_autogptq_available as is_autogptq_available,
is_bentoml_available as is_bentoml_available,
is_bitsandbytes_available as is_bitsandbytes_available,
is_grpc_available as is_grpc_available,
is_jupyter_available as is_jupyter_available,
is_jupytext_available as is_jupytext_available,
is_notebook_available as is_notebook_available,
is_peft_available as is_peft_available,
is_torch_available as is_torch_available,
is_transformers_available as is_transformers_available,
is_vllm_available as is_vllm_available,
lenient_issubclass as lenient_issubclass,
reserve_free_port as reserve_free_port,
resolve_filepath as resolve_filepath,
resolve_user_filepath as resolve_user_filepath,
serde as serde,
set_debug_mode as set_debug_mode,
set_disable_warnings as set_disable_warnings,
set_quiet_mode as set_quiet_mode,
validate_is_path as validate_is_path,
)
from openllm_core.utils.serde import converter as converter
from .._llm import LLM
def available_devices() -> Tuple[str, ...]: ...
def device_count() -> int: ...
def generate_labels(llm: LLM[Any, Any]) -> Dict[str, Any]: ...