feat(ctranslate): initial infrastructure support (#694)

* perf: compact and improve speed and agility

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* --wip--

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: cleanup infrastructure

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update styles notes and autogen mypy configuration

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Author: Aaron Pham
Date: 2023-11-19 01:48:33 -05:00
Committed by: GitHub
Parent: 93ffb29e9f
Commit: 206521e02d
38 changed files with 507 additions and 642 deletions

View File

@@ -42,6 +42,7 @@ from openllm_core.utils import (
generate_hash_from_file,
get_disable_warnings,
get_quiet_mode,
getenv,
is_peft_available,
is_vllm_available,
resolve_filepath,
@@ -52,6 +53,7 @@ from .exceptions import ForbiddenAttributeError, OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME
if t.TYPE_CHECKING:
import torch
import transformers
from peft.config import PeftConfig
@@ -109,8 +111,8 @@ def _torch_dtype_mapping():
return {
'half': torch.float16,
'float16': torch.float16,
'float': torch.float32,
'float16': torch.float16,
'float32': torch.float32,
'bfloat16': torch.bfloat16,
}
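
As a quick sanity check on the mapping above, aliases and canonical names resolve to the same singleton torch dtype (a minimal sketch, assuming torch is installed):

import torch

_mapping = {
  'half': torch.float16,
  'float16': torch.float16,
  'float': torch.float32,
  'float32': torch.float32,
  'bfloat16': torch.bfloat16,
}
assert _mapping['half'] is _mapping['float16']   # both aliases collapse to torch.float16
assert _mapping['float'] is _mapping['float32']  # and to torch.float32
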
@@ -132,7 +134,8 @@ class LLM(t.Generic[M, T], ReprMixin):
_prompt_template: PromptTemplate | None
_system_message: str | None
__llm_torch_dtype__: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto'
__llm_dtype__: LiteralDtype | t.Literal['auto', 'half', 'float'] = 'auto'
__llm_torch_dtype__: 'torch.dtype' = None
__llm_config__: LLMConfig | None = None
__llm_backend__: LiteralBackend = None # type: ignore
__llm_quantization_config__: transformers.BitsAndBytesConfig | transformers.GPTQConfig | transformers.AwqConfig | None = None
@@ -158,16 +161,23 @@ class LLM(t.Generic[M, T], ReprMixin):
serialisation='safetensors',
trust_remote_code=False,
embedded=False,
torch_dtype='auto',
dtype='auto',
low_cpu_mem_usage=True,
**attrs,
):
# backward compatible
torch_dtype = attrs.pop('torch_dtype', None)
if torch_dtype is not None:
logger.warning(
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.'
)
dtype = torch_dtype
_local = False
if validate_is_path(model_id):
model_id, _local = resolve_filepath(model_id), True
backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt')
torch_dtype = first_not_none(os.getenv('TORCH_DTYPE'), torch_dtype, default='auto')
quantize = first_not_none(quantize, os.getenv('OPENLLM_QUANTIZE'), default=None)
dtype = first_not_none(getenv('dtype', default=dtype, var=['TORCH_DTYPE']), default='auto')
quantize = first_not_none(getenv('quantize', default=quantize, var=['QUANTISE']), default=None)
attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage})
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
model_attrs, tokenizer_attrs = flatten_attrs(**attrs)
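
The deprecation shim above can be read in isolation as follows. resolve_dtype is a hypothetical stand-in, and the env-var precedence is an assumption inferred from the getenv('dtype', default=dtype, var=['TORCH_DTYPE']) call site, not the actual helper:

import logging
import os

logger = logging.getLogger(__name__)

def resolve_dtype(dtype='auto', **attrs):
  # Accept the deprecated 'torch_dtype' kwarg, warn, and fold it into 'dtype'.
  torch_dtype = attrs.pop('torch_dtype', None)
  if torch_dtype is not None:
    logger.warning('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.')
    dtype = torch_dtype
  # Assumed precedence: OPENLLM_DTYPE, then the legacy TORCH_DTYPE, then the argument.
  return os.environ.get('OPENLLM_DTYPE', os.environ.get('TORCH_DTYPE', dtype))

print(resolve_dtype(torch_dtype='float16'))  # warns, prints 'float16'
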
@@ -189,7 +199,7 @@ class LLM(t.Generic[M, T], ReprMixin):
system_message=system_message,
LLM__model_attrs=model_attrs,
LLM__tokenizer_attrs=tokenizer_attrs,
llm_torch_dtype__=torch_dtype.lower(),
llm_dtype__=dtype.lower(),
llm_backend__=backend,
llm_config__=llm_config,
llm_trust_remote_code__=trust_remote_code,
@@ -222,15 +232,15 @@ class LLM(t.Generic[M, T], ReprMixin):
config_dtype = getattr(hf_config, 'torch_dtype', None)
if config_dtype is None:
config_dtype = torch.float32
if self.__llm_torch_dtype__ == 'auto':
if self.__llm_dtype__ == 'auto':
if config_dtype == torch.float32:
torch_dtype = torch.float16 # following common practice
else:
torch_dtype = config_dtype
else:
if self.__llm_torch_dtype__ not in _torch_dtype_mapping():
raise ValueError(f"Unknown dtype '{self.__llm_torch_dtype__}'")
torch_dtype = _torch_dtype_mapping()[self.__llm_torch_dtype__]
if self.__llm_dtype__ not in _torch_dtype_mapping():
raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
torch_dtype = _torch_dtype_mapping()[self.__llm_dtype__]
self.__llm_torch_dtype__ = torch_dtype
return self.__llm_torch_dtype__
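
Restated as a standalone function, the resolution above behaves like this sketch (name and signature assumed for illustration):

import torch

def _resolve_torch_dtype(requested: str, config_dtype) -> torch.dtype:
  mapping = {'half': torch.float16, 'float16': torch.float16, 'float': torch.float32, 'float32': torch.float32, 'bfloat16': torch.bfloat16}
  if config_dtype is None:
    config_dtype = torch.float32  # HF configs without torch_dtype default to float32
  if requested == 'auto':
    # float32 checkpoints are downcast to float16, following common practice
    return torch.float16 if config_dtype == torch.float32 else config_dtype
  if requested not in mapping:
    raise ValueError(f"Unknown dtype '{requested}'")
  return mapping[requested]

assert _resolve_torch_dtype('auto', None) is torch.float16
assert _resolve_torch_dtype('bfloat16', torch.float32) is torch.bfloat16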

View File

@@ -32,7 +32,8 @@ class IdentifyingParams(TypedDict):
model_id: str
ResolvedAdapterMap = Dict[AdapterType, Dict[str, Tuple[PeftConfig, str]]]
Dtype = Union[LiteralDtype, Literal['auto', 'half', 'float']]
CTranslateDtype = Literal['int8_float32', 'int8_float16', 'int8_bfloat16']
Dtype = Union[LiteralDtype, CTranslateDtype, Literal['auto', 'half', 'float']]
@attr.define(slots=True, repr=False, init=False)
class LLM(Generic[M, T]):
@@ -50,7 +51,8 @@ class LLM(Generic[M, T]):
_prompt_template: Optional[PromptTemplate]
_system_message: Optional[str]
__llm_torch_dtype__: Dtype = ...
__llm_dtype__: Dtype = ...
__llm_torch_dtype__: Optional[torch.dtype] = ...
__llm_config__: Optional[LLMConfig] = ...
__llm_backend__: LiteralBackend = ...
__llm_quantization_config__: Optional[QuantizationConfig] = ...
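
The widened union means a CTranslate2 compute type is now a valid dtype value. For illustration (the exact definition of LiteralDtype is assumed here):

from typing import Literal, Union

LiteralDtype = Literal['float16', 'float32', 'bfloat16']  # assumed definition
CTranslateDtype = Literal['int8_float32', 'int8_float16', 'int8_bfloat16']
Dtype = Union[LiteralDtype, CTranslateDtype, Literal['auto', 'half', 'float']]

dtype: Dtype = 'int8_float16'  # now accepted for the ctranslate backend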

View File

@@ -1,12 +1,7 @@
from __future__ import annotations
from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import (
is_autoawq_available,
is_autogptq_available,
is_bitsandbytes_available,
is_optimum_supports_gptq,
)
from openllm_core.utils import is_autoawq_available, is_autogptq_available, is_bitsandbytes_available
def infer_quantisation_config(llm, quantise, **attrs):
@@ -98,7 +93,7 @@ def infer_quantisation_config(llm, quantise, **attrs):
elif quantise == 'int4':
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available() or not is_optimum_supports_gptq():
if not is_autogptq_available():
raise MissingDependencyError(
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
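
After this change the GPTQ branch gates only on auto-gptq. Condensed as a sketch (the helper name _check_gptq is hypothetical; the imports are the ones used in this file):

from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import is_autogptq_available

def _check_gptq() -> None:
  # The optimum version gate is gone; only auto-gptq availability matters now.
  if not is_autogptq_available():
    raise MissingDependencyError("GPTQ requires 'auto-gptq' to be installed. Do it with 'pip install \"openllm[gptq]\"'")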

View File

@@ -1,18 +1,13 @@
"""Protocol-related packages for all library integrations.
Currently supports OpenAI-compatible APIs.
"""
from __future__ import annotations
import os
import typing as t
from openllm_core.utils import LazyModule
_import_structure: dict[str, list[str]] = {'openai': []}
_import_structure: dict[str, list[str]] = {'openai': [], 'cohere': [], 'hf': []}
if t.TYPE_CHECKING:
from . import openai as openai
from . import cohere as cohere, hf as hf, openai as openai
__lazy = LazyModule(__name__, os.path.abspath(__file__), _import_structure)
__all__ = __lazy.__all__
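
LazyModule defers the actual submodule imports until first attribute access. A minimal sketch of the pattern (not the openllm_core implementation):

import importlib
import types

class _LazySketch(types.ModuleType):
  def __init__(self, name, import_structure):
    super().__init__(name)
    self._submodules = set(import_structure)
    self.__all__ = sorted(self._submodules)

  def __getattr__(self, item):
    if item in self._submodules:
      # Import the 'cohere'/'hf'/'openai' submodule only when first accessed.
      return importlib.import_module(f'.{item}', self.__name__)
    raise AttributeError(f'module {self.__name__} has no attribute {item}')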

View File

@@ -1,21 +1,26 @@
from __future__ import annotations
import importlib
import typing as t
import cloudpickle
import fs
from openllm_core._typing_compat import ParamSpec
from openllm_core._typing_compat import M, ParamSpec, T, TypeGuard
from openllm_core.exceptions import OpenLLMException
if t.TYPE_CHECKING:
from bentoml import Model
from .._llm import LLM
P = ParamSpec('P')
def load_tokenizer(llm, **tokenizer_attrs):
def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
"""Load the tokenizer from BentoML store.
By default, it will try to find the bentomodel whether it is in store..
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
"""
import cloudpickle
import fs
from transformers import AutoTokenizer
tokenizer_attrs = {**llm.llm_parameters[-1], **tokenizer_attrs}
@@ -52,34 +57,39 @@ def load_tokenizer(llm, **tokenizer_attrs):
return tokenizer
_extras = ['get', 'import_model', 'load_model']
def _make_dispatch_function(fn):
def caller(llm, *args, **kwargs):
def caller(llm: LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> TypeGuard[M | T | Model]:
"""Generic function dispatch to correct serialisation submodules based on LLM runtime.
> [!NOTE] See 'openllm.serialisation.transformers' if 'llm.__llm_backend__ in ("pt", "vllm")'
> [!NOTE] See 'openllm.serialisation.ggml' if 'llm.__llm_backend__="ggml"'
> [!NOTE] See 'openllm.serialisation.ctranslate' if 'llm.__llm_backend__="ctranslate"'
"""
serde = 'transformers'
if llm.__llm_backend__ == 'ggml':
serde = 'ggml'
return getattr(importlib.import_module(f'.{serde}', __name__), fn)(llm, *args, **kwargs)
elif llm.__llm_backend__ == 'ctranslate':
serde = 'ctranslate'
elif llm.__llm_backend__ in {'pt', 'vllm'}:
serde = 'transformers'
else:
raise OpenLLMException(f'Not supported backend {llm.__llm_backend__}')
return getattr(importlib.import_module(f'.{serde}', 'openllm.serialisation'), fn)(llm, *args, **kwargs)
return caller
_import_structure: dict[str, list[str]] = {'ggml': [], 'transformers': [], 'constants': []}
__all__ = ['ggml', 'transformers', 'constants', 'load_tokenizer', *_extras]
_extras = ['get', 'import_model', 'load_model']
_import_structure = {'ggml', 'transformers', 'ctranslate', 'constants'}
__all__ = ['load_tokenizer', *_extras, *_import_structure]
def __dir__():
def __dir__() -> t.Sequence[str]:
return sorted(__all__)
def __getattr__(name):
def __getattr__(name: str) -> t.Any:
if name == 'load_tokenizer':
return load_tokenizer
elif name in _import_structure:
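
Condensed, the dispatch above is a routing table from backend to serialisation submodule (a sketch; the dispatch function name is hypothetical):

import importlib

from openllm_core.exceptions import OpenLLMException

_SERDE_BY_BACKEND = {'ggml': 'ggml', 'ctranslate': 'ctranslate', 'pt': 'transformers', 'vllm': 'transformers'}

def dispatch(fn, llm, *args, **kwargs):
  # Equivalent to the if/elif chain in caller() above.
  serde = _SERDE_BY_BACKEND.get(llm.__llm_backend__)
  if serde is None:
    raise OpenLLMException(f'Not supported backend {llm.__llm_backend__}')
  return getattr(importlib.import_module(f'.{serde}', 'openllm.serialisation'), fn)(llm, *args, **kwargs)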

View File

@@ -63,9 +63,22 @@ def import_model(llm, *decls, trust_remote_code, _model_store=Provide[BentoMLCon
metadata['_quantize'] = quantize
architectures = getattr(config, 'architectures', [])
if not architectures:
raise RuntimeError(
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
)
if trust_remote_code:
auto_map = getattr(config, 'auto_map', {})
if not auto_map:
raise RuntimeError(
f'Failed to determine the architecture from either `auto_map` or `architectures` for {llm.model_id}'
)
autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
if autoclass not in auto_map:
raise RuntimeError(
f"Given model '{llm.model_id}' is yet to be supported with 'auto_map'. OpenLLM currently only support encoder-decoders or decoders only models."
)
architectures = [auto_map[autoclass]]
else:
raise RuntimeError(
'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
)
metadata['_pretrained_class'] = architectures[0]
if not llm._local:
metadata['_revision'] = get_hash(config)
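
The new trust_remote_code branch, condensed into one hypothetical helper (auto_map is the config field remote-code models use to point at their custom classes):

def _resolve_architecture(config, model_type, trust_remote_code):
  # Prefer config.architectures; for remote-code models fall back to the
  # auto_map entry for the matching Auto class.
  architectures = getattr(config, 'architectures', []) or []
  if architectures:
    return architectures[0]
  if not trust_remote_code:
    raise RuntimeError('Failed to determine the architecture for this model.')
  auto_map = getattr(config, 'auto_map', {}) or {}
  autoclass = 'AutoModelForSeq2SeqLM' if model_type == 'seq2seq_lm' else 'AutoModelForCausalLM'
  if autoclass not in auto_map:
    raise RuntimeError(f"'{autoclass}' not found in auto_map; only encoder-decoder or decoder-only models are supported.")
  return auto_map[autoclass]
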
@@ -75,7 +88,7 @@ def import_model(llm, *decls, trust_remote_code, _model_store=Provide[BentoMLCon
signatures = {}
if quantize == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
if not openllm.utils.is_autogptq_available():
raise OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)
@@ -213,7 +226,7 @@ def load_model(llm, *decls, **attrs):
if '_quantize' in llm.bentomodel.info.metadata:
_quantise = llm.bentomodel.info.metadata['_quantize']
if _quantise == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
if not openllm.utils.is_autogptq_available():
raise OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import copy
import logging
import typing as t
import transformers
@@ -10,14 +11,14 @@ from openllm_core.utils import get_disable_warnings, get_quiet_mode
logger = logging.getLogger(__name__)
def get_hash(config) -> str:
def get_hash(config: transformers.PretrainedConfig) -> str:
_commit_hash = getattr(config, '_commit_hash', None)
if _commit_hash is None:
raise ValueError(f'Cannot find commit hash in {config}')
return _commit_hash
def process_config(model_id, trust_remote_code, **attrs):
def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any):
config = attrs.pop('config', None)
# this logic below is synonymous to handling `from_pretrained` attrs.
hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs}
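
For context, the hub_attrs split separates from_pretrained-style hub options from config overrides. A sketch, with HUB_ATTRS assumed to list hub-only keys:

HUB_ATTRS = ['revision', 'cache_dir', 'force_download', 'proxies']  # assumed subset

def split_hub_attrs(**attrs):
  # Pull hub-only kwargs out so the remainder can be applied to the config.
  hub_attrs = {k: attrs.pop(k) for k in HUB_ATTRS if k in attrs}
  return hub_attrs, attrs

hub, rest = split_hub_attrs(revision='main', torch_dtype='float16')
print(hub)   # {'revision': 'main'}
print(rest)  # {'torch_dtype': 'float16'}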

View File

@@ -1,74 +1,10 @@
"""Utilities function for OpenLLM.
User can import these function for convenience, but
we won't ensure backward compatibility for these functions. So use with caution.
"""
from __future__ import annotations
import functools
import importlib.metadata
import typing as t
import openllm_core
if t.TYPE_CHECKING:
import openllm
from openllm_core.utils import (
DEBUG as DEBUG,
DEBUG_ENV_VAR as DEBUG_ENV_VAR,
DEV_DEBUG_VAR as DEV_DEBUG_VAR,
ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES,
MYPY as MYPY,
OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
QUIET_ENV_VAR as QUIET_ENV_VAR,
SHOW_CODEGEN as SHOW_CODEGEN,
LazyLoader as LazyLoader,
LazyModule as LazyModule,
ReprMixin as ReprMixin,
VersionInfo as VersionInfo,
analytics as analytics,
calc_dir_size as calc_dir_size,
check_bool_env as check_bool_env,
codegen as codegen,
configure_logging as configure_logging,
dantic as dantic,
field_env_key as field_env_key,
first_not_none as first_not_none,
flatten_attrs as flatten_attrs,
gen_random_uuid as gen_random_uuid,
generate_context as generate_context,
generate_hash_from_file as generate_hash_from_file,
get_debug_mode as get_debug_mode,
get_disable_warnings as get_disable_warnings,
get_quiet_mode as get_quiet_mode,
in_notebook as in_notebook,
is_autoawq_available as is_autoawq_available,
is_autogptq_available as is_autogptq_available,
is_bentoml_available as is_bentoml_available,
is_bitsandbytes_available as is_bitsandbytes_available,
is_grpc_available as is_grpc_available,
is_jupyter_available as is_jupyter_available,
is_jupytext_available as is_jupytext_available,
is_notebook_available as is_notebook_available,
is_optimum_supports_gptq as is_optimum_supports_gptq,
is_peft_available as is_peft_available,
is_torch_available as is_torch_available,
is_transformers_available as is_transformers_available,
is_vllm_available as is_vllm_available,
lenient_issubclass as lenient_issubclass,
reserve_free_port as reserve_free_port,
resolve_filepath as resolve_filepath,
resolve_user_filepath as resolve_user_filepath,
serde as serde,
set_debug_mode as set_debug_mode,
set_disable_warnings as set_disable_warnings,
set_quiet_mode as set_quiet_mode,
validate_is_path as validate_is_path,
)
from openllm_core.utils.serde import converter as converter
def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
def generate_labels(llm):
return {
'backend': llm.__llm_backend__,
'framework': 'openllm',
@@ -79,27 +15,26 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
}
def available_devices() -> tuple[str, ...]:
"""Return available GPU under system. Currently only supports NVIDIA GPUs."""
def available_devices():
from .._strategies import NvidiaGpuResource
return tuple(NvidiaGpuResource.from_system())
@functools.lru_cache(maxsize=1)
def device_count() -> int:
def device_count():
return len(available_devices())
__all__ = ['generate_labels', 'available_devices', 'device_count']
def __dir__() -> t.Sequence[str]:
return sorted(__all__) + sorted(dir(openllm_core.utils))
def __dir__():
coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')])
return sorted(__all__) + sorted(list(coreutils))
def __getattr__(it: str) -> t.Any:
def __getattr__(it):
if hasattr(openllm_core.utils, it):
return getattr(openllm_core.utils, it)
else:
raise AttributeError(f'module {__name__} has no attribute {it}')
raise AttributeError(f'module {__name__} has no attribute {it}')
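
In practice the PEP 562 __getattr__ hook above lets openllm.utils proxy everything from openllm_core.utils. A hypothetical usage:

import openllm.utils

openllm.utils.device_count()            # defined locally in this module
openllm.utils.first_not_none(None, 1)   # proxied through to openllm_core.utils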

View File

@@ -0,0 +1,61 @@
from typing import Any, Dict, Tuple
from openllm_core.utils import (
DEBUG as DEBUG,
DEBUG_ENV_VAR as DEBUG_ENV_VAR,
DEV_DEBUG_VAR as DEV_DEBUG_VAR,
ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES,
MYPY as MYPY,
OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
QUIET_ENV_VAR as QUIET_ENV_VAR,
SHOW_CODEGEN as SHOW_CODEGEN,
LazyLoader as LazyLoader,
LazyModule as LazyModule,
ReprMixin as ReprMixin,
VersionInfo as VersionInfo,
analytics as analytics,
calc_dir_size as calc_dir_size,
check_bool_env as check_bool_env,
codegen as codegen,
configure_logging as configure_logging,
dantic as dantic,
field_env_key as field_env_key,
first_not_none as first_not_none,
flatten_attrs as flatten_attrs,
gen_random_uuid as gen_random_uuid,
generate_context as generate_context,
generate_hash_from_file as generate_hash_from_file,
get_debug_mode as get_debug_mode,
get_disable_warnings as get_disable_warnings,
get_quiet_mode as get_quiet_mode,
getenv as getenv,
in_notebook as in_notebook,
is_autoawq_available as is_autoawq_available,
is_autogptq_available as is_autogptq_available,
is_bentoml_available as is_bentoml_available,
is_bitsandbytes_available as is_bitsandbytes_available,
is_grpc_available as is_grpc_available,
is_jupyter_available as is_jupyter_available,
is_jupytext_available as is_jupytext_available,
is_notebook_available as is_notebook_available,
is_peft_available as is_peft_available,
is_torch_available as is_torch_available,
is_transformers_available as is_transformers_available,
is_vllm_available as is_vllm_available,
lenient_issubclass as lenient_issubclass,
reserve_free_port as reserve_free_port,
resolve_filepath as resolve_filepath,
resolve_user_filepath as resolve_user_filepath,
serde as serde,
set_debug_mode as set_debug_mode,
set_disable_warnings as set_disable_warnings,
set_quiet_mode as set_quiet_mode,
validate_is_path as validate_is_path,
)
from openllm_core.utils.serde import converter as converter
from .._llm import LLM
def available_devices() -> Tuple[str, ...]: ...
def device_count() -> int: ...
def generate_labels(llm: LLM[Any, Any]) -> Dict[str, Any]: ...