From 816c1ee80e4a79c908dad8fee0f5b050dd1514a4 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sun, 19 Nov 2023 10:25:08 -0500 Subject: [PATCH] feat(engine): CTranslate2 (#698) * chore: update instruction for dependencies Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * feat(experimental): CTranslate2 Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- README.md | 160 +++++++++++- all.sh | 6 +- mypy.ini | 2 +- openllm-client/src/openllm_client/_utils.pyi | 1 + .../src/openllm_core/_configuration.py | 2 +- openllm-core/src/openllm_core/_schemas.py | 21 +- .../src/openllm_core/_typing_compat.py | 9 +- .../openllm_core/config/configuration_auto.py | 15 +- .../config/configuration_baichuan.py | 1 + .../config/configuration_chatglm.py | 1 + .../config/configuration_mistral.py | 1 + .../openllm_core/config/configuration_phi.py | 1 + .../openllm_core/config/configuration_yi.py | 1 + openllm-python/README.md | 160 +++++++++++- openllm-python/src/openllm/_llm.py | 233 ++++++++++-------- openllm-python/src/openllm/_llm.pyi | 2 +- openllm-python/src/openllm/_runners.py | 86 ++++++- openllm-python/src/openllm/_runners.pyi | 24 +- .../src/openllm/serialisation/_helpers.py | 159 ++++++++++++ .../src/openllm/serialisation/_helpers.pyi | 24 ++ .../serialisation/ctranslate/__init__.py | 100 ++++++++ .../serialisation/transformers/__init__.py | 200 ++++----------- .../serialisation/transformers/_helpers.py | 12 +- openllm-python/src/openllm/utils/__init__.pyi | 1 + openllm-python/src/openllm_cli/_factory.py | 17 +- openllm-python/src/openllm_cli/_sdk.py | 4 +- openllm-python/src/openllm_cli/entrypoint.py | 21 +- .../openllm_cli/playground/falcon_tuned.py | 4 +- .../openllm_cli/playground/llama2_qlora.py | 4 +- .../src/openllm_cli/playground/opt_tuned.py | 4 +- tools/update-readme.py | 19 ++ 31 files changed, 945 insertions(+), 350 
deletions(-) create mode 100644 openllm-python/src/openllm/serialisation/_helpers.py create mode 100644 openllm-python/src/openllm/serialisation/_helpers.pyi diff --git a/README.md b/README.md index 7dd67ca4..45ed8011 100644 --- a/README.md +++ b/README.md @@ -416,6 +416,25 @@ openllm start databricks/dolly-v2-3b --backend vllm openllm start databricks/dolly-v2-3b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start databricks/dolly-v2-3b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -494,6 +513,25 @@ openllm start tiiuae/falcon-7b --backend vllm openllm start tiiuae/falcon-7b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start tiiuae/falcon-7b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -615,6 +653,25 @@ openllm start eleutherai/gpt-neox-20b --backend vllm openllm start eleutherai/gpt-neox-20b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start eleutherai/gpt-neox-20b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -624,14 +681,6 @@ openllm start eleutherai/gpt-neox-20b --backend pt ### Quickstart - - -> **Note:** Llama requires to install with: -> ```bash -> pip install "openllm[llama]" -> ``` - - Run the following command to quickly spin up a Llama server: ```bash @@ -701,6 +750,25 @@ openllm start meta-llama/Llama-2-70b-chat-hf --backend vllm openllm start meta-llama/Llama-2-70b-chat-hf --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start meta-llama/Llama-2-70b-chat-hf --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -852,6 +920,25 @@ TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend vllm TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -924,6 +1011,25 @@ openllm start facebook/opt-125m --backend vllm openllm start facebook/opt-125m --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start facebook/opt-125m --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -1061,6 +1167,25 @@ openllm start stabilityai/stablelm-tuned-alpha-3b --backend vllm openllm start stabilityai/stablelm-tuned-alpha-3b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start stabilityai/stablelm-tuned-alpha-3b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -1137,6 +1262,25 @@ openllm start bigcode/starcoder --backend vllm openllm start bigcode/starcoder --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start bigcode/starcoder --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
diff --git a/all.sh b/all.sh index c9fffb04..ba2da67b 100644 --- a/all.sh +++ b/all.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash -printf "Running mirror.sh\n" -bash ./tools/mirror.sh printf "Running update-mypy.py\n" python ./tools/update-mypy.py printf "Running update-config-stubs.py\n" python ./tools/dependencies.py printf "Running dependencies.py\n" python ./tools/update-config-stubs.py +printf "Running update-readme.py\n" +python ./tools/update-readme.py +printf "Running mirror.sh\n" +bash ./tools/mirror.sh diff --git a/mypy.ini b/mypy.ini index 8a9c2a4f..c63d8cc2 100644 --- a/mypy.ini +++ b/mypy.ini @@ -8,4 +8,4 @@ warn_unused_configs = true ignore_missing_imports = true check_untyped_defs = true warn_unreachable = true -files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi, openllm-python/src/openllm/_service_vars.pyi, openllm-python/src/openllm/utils/__init__.pyi +files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, 
openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi, openllm-python/src/openllm/_service_vars.pyi, openllm-python/src/openllm/utils/__init__.pyi, openllm-python/src/openllm/serialisation/_helpers.pyi diff --git a/openllm-client/src/openllm_client/_utils.pyi b/openllm-client/src/openllm_client/_utils.pyi index a66f9ce9..6aa84fc2 100644 --- a/openllm-client/src/openllm_client/_utils.pyi +++ b/openllm-client/src/openllm_client/_utils.pyi @@ -37,6 +37,7 @@ from openllm_core.utils.import_utils import ( is_autogptq_available as is_autogptq_available, is_bentoml_available as is_bentoml_available, is_bitsandbytes_available as is_bitsandbytes_available, + is_ctranslate_available as is_ctranslate_available, is_grpc_available as is_grpc_available, is_jupyter_available as is_jupyter_available, is_jupytext_available as is_jupytext_available, diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py index 37cd3f4d..c01ac284 100644 --- a/openllm-core/src/openllm_core/_configuration.py +++ b/openllm-core/src/openllm_core/_configuration.py @@ -457,7 +457,7 @@ _DEFAULT = _ModelSettingsAttr( model_ids=['__default__'], architecture='PreTrainedModel', serialisation='legacy', - backend=('pt', 'vllm'), + backend=('pt', 'vllm', 'ctranslate'), name_type='dasherize', url='', model_type='causal_lm', diff --git a/openllm-core/src/openllm_core/_schemas.py b/openllm-core/src/openllm_core/_schemas.py index 23e54420..f8e94994 100644 --- 
a/openllm-core/src/openllm_core/_schemas.py +++ b/openllm-core/src/openllm_core/_schemas.py @@ -7,7 +7,7 @@ import orjson from ._configuration import LLMConfig from .config import AutoConfig -from .utils import ReprMixin, converter, gen_random_uuid +from .utils import converter, gen_random_uuid if t.TYPE_CHECKING: import vllm @@ -15,15 +15,8 @@ if t.TYPE_CHECKING: from ._typing_compat import Self -@attr.define(repr=False) -class _SchemaMixin(ReprMixin): - @property - def __repr_keys__(self): - return list(attr.fields_dict(self.__class__)) - - def __repr_args__(self): - yield from ((k, getattr(self, k)) for k in self.__repr_keys__) - +@attr.define +class _SchemaMixin: def model_dump(self) -> dict[str, t.Any]: return converter.unstructure(self) @@ -34,7 +27,7 @@ class _SchemaMixin(ReprMixin): return attr.evolve(self, **options) -@attr.define(repr=False) +@attr.define class MetadataOutput(_SchemaMixin): model_id: str timeout: int @@ -56,7 +49,7 @@ class MetadataOutput(_SchemaMixin): } -@attr.define(repr=False) +@attr.define class GenerationInput(_SchemaMixin): prompt: str llm_config: LLMConfig @@ -116,7 +109,7 @@ PromptLogprobs = t.List[t.Optional[t.Dict[int, float]]] FinishReason = t.Literal['length', 'stop'] -@attr.define(repr=False) +@attr.define class CompletionChunk(_SchemaMixin): index: int text: str @@ -129,7 +122,7 @@ class CompletionChunk(_SchemaMixin): return orjson.dumps(self.model_dump(), option=orjson.OPT_NON_STR_KEYS).decode('utf-8') -@attr.define(repr=False) +@attr.define class GenerationOutput(_SchemaMixin): prompt: str finished: bool diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index fbe2f719..e13bd753 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -5,17 +5,20 @@ import typing as t import attr if t.TYPE_CHECKING: + from ctranslate2 import Generator, Translator from peft.peft_model import PeftModel from transformers 
import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast from .utils.lazy import VersionInfo else: # NOTE: t.Any is also a type - PeftModel = PreTrainedModel = PreTrainedTokenizer = PreTrainedTokenizerBase = PreTrainedTokenizerFast = t.Any + PeftModel = ( + PreTrainedModel + ) = PreTrainedTokenizer = PreTrainedTokenizerBase = PreTrainedTokenizerFast = Generator = Translator = t.Any # NOTE: that VersionInfo is from openllm.utils.lazy.VersionInfo VersionInfo = t.Any -M = t.TypeVar('M', bound=t.Union[PreTrainedModel, PeftModel]) +M = t.TypeVar('M', bound=t.Union[PreTrainedModel, PeftModel, Generator, Translator]) T = t.TypeVar('T', bound=t.Union[PreTrainedTokenizerFast, PreTrainedTokenizer, PreTrainedTokenizerBase]) @@ -33,7 +36,7 @@ At = t.TypeVar('At', bound=attr.AttrsInstance) LiteralDtype = t.Literal['float16', 'float32', 'bfloat16', 'int8', 'int16'] LiteralSerialisation = t.Literal['safetensors', 'legacy'] LiteralQuantise = t.Literal['int8', 'int4', 'gptq', 'awq', 'squeezellm'] -LiteralBackend = t.Literal['pt', 'vllm', 'ctranslate', 'ggml', 'mlc'] +LiteralBackend = t.Literal['pt', 'vllm', 'ctranslate'] # TODO: ggml AdapterType = t.Literal[ 'lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3', 'loha', 'lokr' ] diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index 55bc5c79..92096fb6 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -185,15 +185,20 @@ class AutoConfig: f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}." 
) + _cached_mapping = None + + @classmethod + def _CONFIG_MAPPING_NAMES_TO_ARCHITECTURE(cls) -> dict[str, str]: + if cls._cached_mapping is None: + AutoConfig._cached_mapping = {v.__config__['architecture']: k for k, v in CONFIG_MAPPING.items()} + return AutoConfig._cached_mapping + @classmethod def infer_class_from_llm(cls, llm: openllm.LLM[M, T]) -> type[openllm_core.LLMConfig]: if not is_bentoml_available(): raise MissingDependencyError( "'infer_class_from_llm' requires 'bentoml' to be available. Make sure to install it with 'pip install bentoml'" ) - CONFIG_MAPPING_NAMES_TO_ARCHITECTURE: dict[str, str] = { - v.__config__['architecture']: k for k, v in CONFIG_MAPPING.items() - } if llm._local: config_file = os.path.join(llm.model_id, CONFIG_FILE_NAME) else: @@ -218,8 +223,8 @@ class AutoConfig: loaded_config = orjson.loads(f.read()) if 'architectures' in loaded_config: for architecture in loaded_config['architectures']: - if architecture in CONFIG_MAPPING_NAMES_TO_ARCHITECTURE: - return cls.infer_class_from_name(CONFIG_MAPPING_NAMES_TO_ARCHITECTURE[architecture]) + if architecture in cls._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE(): + return cls.infer_class_from_name(cls._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]) raise ValueError( f"Failed to determine config class for '{llm.model_id}'. Make sure {llm.model_id} is saved with openllm." 
) diff --git a/openllm-core/src/openllm_core/config/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py index a9540f3d..a899e53d 100644 --- a/openllm-core/src/openllm_core/config/configuration_baichuan.py +++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py @@ -25,6 +25,7 @@ class BaichuanConfig(openllm_core.LLMConfig): 'timeout': 3600000, 'url': 'https://github.com/baichuan-inc/Baichuan-7B', 'requirements': ['cpm-kernels'], + 'backend': ('pt', 'vllm'), 'architecture': 'BaiChuanForCausalLM', # NOTE: See the following # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555 diff --git a/openllm-core/src/openllm_core/config/configuration_chatglm.py b/openllm-core/src/openllm_core/config/configuration_chatglm.py index 7a205349..c700ad52 100644 --- a/openllm-core/src/openllm_core/config/configuration_chatglm.py +++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py @@ -31,6 +31,7 @@ class ChatGLMConfig(openllm_core.LLMConfig): 'timeout': 3600000, 'url': 'https://github.com/THUDM/ChatGLM-6B', 'requirements': ['cpm-kernels'], + 'backend': ('pt', 'vllm'), 'architecture': 'ChatGLMModel', 'default_id': 'thudm/chatglm-6b', 'model_ids': [ diff --git a/openllm-core/src/openllm_core/config/configuration_mistral.py b/openllm-core/src/openllm_core/config/configuration_mistral.py index 94ce4e5d..95531aee 100644 --- a/openllm-core/src/openllm_core/config/configuration_mistral.py +++ b/openllm-core/src/openllm_core/config/configuration_mistral.py @@ -32,6 +32,7 @@ class MistralConfig(openllm_core.LLMConfig): 'add_generation_prompt': True, 'default_id': 'mistralai/Mistral-7B-Instruct-v0.1', 'serialisation': 'safetensors', + 'backend': ('pt', 'vllm'), # NOTE: see https://docs.mistral.ai/usage/guardrailing/ # and https://docs.mistral.ai/llm/mistral-instruct-v0.1 'model_ids': [ diff --git a/openllm-core/src/openllm_core/config/configuration_phi.py 
b/openllm-core/src/openllm_core/config/configuration_phi.py index 01ea62cf..31abee79 100644 --- a/openllm-core/src/openllm_core/config/configuration_phi.py +++ b/openllm-core/src/openllm_core/config/configuration_phi.py @@ -26,6 +26,7 @@ class PhiConfig(openllm_core.LLMConfig): 'url': 'https://arxiv.org/abs/2309.05463', 'architecture': 'PhiForCausalLM', 'trust_remote_code': True, + 'backend': ('pt', 'vllm'), 'default_id': 'microsoft/phi-1_5', 'serialisation': 'safetensors', 'model_ids': ['microsoft/phi-1_5'], diff --git a/openllm-core/src/openllm_core/config/configuration_yi.py b/openllm-core/src/openllm_core/config/configuration_yi.py index f40677f1..ce33f5ed 100644 --- a/openllm-core/src/openllm_core/config/configuration_yi.py +++ b/openllm-core/src/openllm_core/config/configuration_yi.py @@ -22,6 +22,7 @@ class YiConfig(openllm_core.LLMConfig): 'url': 'https://01.ai/', 'architecture': 'YiForCausalLM', 'trust_remote_code': True, + 'backend': ('pt', 'vllm'), 'default_id': '01-ai/Yi-6B', 'serialisation': 'safetensors', 'model_ids': ['01-ai/Yi-6B', '01-ai/Yi-34B', '01-ai/Yi-6B-200K', '01-ai/Yi-34B-200K'], diff --git a/openllm-python/README.md b/openllm-python/README.md index 7dd67ca4..45ed8011 100644 --- a/openllm-python/README.md +++ b/openllm-python/README.md @@ -416,6 +416,25 @@ openllm start databricks/dolly-v2-3b --backend vllm openllm start databricks/dolly-v2-3b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start databricks/dolly-v2-3b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. 
It is recommended to use vLLM for all production use-case. + +
@@ -494,6 +513,25 @@ openllm start tiiuae/falcon-7b --backend vllm openllm start tiiuae/falcon-7b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start tiiuae/falcon-7b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -615,6 +653,25 @@ openllm start eleutherai/gpt-neox-20b --backend vllm openllm start eleutherai/gpt-neox-20b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start eleutherai/gpt-neox-20b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -624,14 +681,6 @@ openllm start eleutherai/gpt-neox-20b --backend pt ### Quickstart - - -> **Note:** Llama requires to install with: -> ```bash -> pip install "openllm[llama]" -> ``` - - Run the following command to quickly spin up a Llama server: ```bash @@ -701,6 +750,25 @@ openllm start meta-llama/Llama-2-70b-chat-hf --backend vllm openllm start meta-llama/Llama-2-70b-chat-hf --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start meta-llama/Llama-2-70b-chat-hf --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -852,6 +920,25 @@ TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend vllm TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -924,6 +1011,25 @@ openllm start facebook/opt-125m --backend vllm openllm start facebook/opt-125m --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start facebook/opt-125m --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -1061,6 +1167,25 @@ openllm start stabilityai/stablelm-tuned-alpha-3b --backend vllm openllm start stabilityai/stablelm-tuned-alpha-3b --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start stabilityai/stablelm-tuned-alpha-3b --backend ctranslate +``` + + +> **Note:** Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case. + +
@@ -1137,6 +1262,25 @@ openllm start bigcode/starcoder --backend vllm openllm start bigcode/starcoder --backend pt ``` +- CTranslate2 (*experimental*): + + +```bash +openllm start bigcode/starcoder --backend ctranslate +``` + + +> **Note:** Currently, all quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16 + + + +> **Note:** We recommend that users convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information. + + + +> **Important:** CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use cases. + +
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index db25774e..4e707a09 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -1,6 +1,5 @@ from __future__ import annotations import functools -import importlib.util import logging import os import types @@ -43,6 +42,7 @@ from openllm_core.utils import ( get_disable_warnings, get_quiet_mode, getenv, + is_ctranslate_available, is_peft_available, is_vllm_available, resolve_filepath, @@ -165,32 +165,28 @@ class LLM(t.Generic[M, T], ReprMixin): low_cpu_mem_usage=True, **attrs, ): - # backward compatible - torch_dtype = attrs.pop('torch_dtype', None) - if torch_dtype is not None: - logger.warning( - 'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.' - ) - dtype = torch_dtype + # fmt: off + torch_dtype = attrs.pop('torch_dtype',None) # backward compatible + if torch_dtype is not None:logger.warning('The argument "torch_dtype" is deprecated and will be removed in the future. 
Please use "dtype" instead.');dtype=torch_dtype _local = False - if validate_is_path(model_id): - model_id, _local = resolve_filepath(model_id), True - backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt') - dtype = first_not_none(getenv('dtype', default=dtype, var=['TORCH_DTYPE']), default='auto') - quantize = first_not_none(getenv('quantize', default=quantize, var=['QUANITSE']), default=None) - attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage}) + if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True + backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend()) + dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto') + quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None) + attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage}) # parsing tokenizer and model kwargs, as the hierarchy is param pass > default model_attrs, tokenizer_attrs = flatten_attrs(**attrs) if model_tag is None: - model_tag, model_version = self._make_tag_components(model_id, model_version, backend=backend) - if model_version: - model_tag = f'{model_tag}:{model_version}' + model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend) + if model_version:model_tag=f'{model_tag}:{model_version}' + # fmt: on + self.__attrs_init__( model_id=model_id, revision=model_version, tag=bentoml.Tag.from_taglike(model_tag), quantization_config=quantization_config, - quantise=quantize, + quantise=self._resolve_quantise(quantize, backend), model_decls=args, adapter_map=_resolve_peft_config_type(adapter_map) if adapter_map is not None else None, serialisation=serialisation, @@ -217,63 +213,66 @@ class LLM(t.Generic[M, T], ReprMixin): ) self.runner.init_local(quiet=True) + # fmt: off + def _resolve_quantise(self, quantise, backend): + if backend in ('pt', 'vllm'):return quantise + if 
backend=='ctranslate':return self._resolve_ctranslate_quantise(quantise) + raise NotImplementedError(f"Quantisation is not supported for backend '{self.__llm_backend__}'") + def _resolve_ctranslate_quantise(self,quantise): + if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'") + if quantise == 'int8':quantise='int8_float16' if self._has_gpus else 'int8_float32' + return quantise + @apply(lambda val:tuple(str.lower(i) if i else i for i in val)) + def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]: + model_id,*maybe_revision=model_id.rsplit(':') + if len(maybe_revision)>0: + if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version) + model_version = maybe_revision[0] + if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id)) + return f'{backend}-{normalise_model_name(model_id)}',model_version + @functools.cached_property + def _has_gpus(self): + try: + from cuda import cuda + err,*_=cuda.cuInit(0) + if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.') + err,num_gpus=cuda.cuDeviceGetCount() + if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.') + return True + except (ImportError, RuntimeError):return False @property def _torch_dtype(self): - import torch - import transformers - - if not isinstance(self.__llm_torch_dtype__, torch.dtype): - try: - hf_config = transformers.AutoConfig.from_pretrained( - self.bentomodel.path, trust_remote_code=self.trust_remote_code - ) - except OpenLLMException: - hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code) - config_dtype = getattr(hf_config, 'torch_dtype', 
None) - if config_dtype is None: - config_dtype = torch.float32 - if self.__llm_dtype__ == 'auto': - if config_dtype == torch.float32: - torch_dtype = torch.float16 # following common practice - else: - torch_dtype = config_dtype + import torch, transformers # noqa: I001 + _map=_torch_dtype_mapping() + if not isinstance(self.__llm_torch_dtype__,torch.dtype): + try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code) + except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code) + config_dtype=getattr(hf_config,'torch_dtype',None) + if config_dtype is None:config_dtype=torch.float32 + if self.__llm_dtype__=='auto': + if config_dtype==torch.float32:torch_dtype=torch.float16 + else:torch_dtype=config_dtype else: - if self.__llm_dtype__ not in _torch_dtype_mapping(): - raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'") - torch_dtype = _torch_dtype_mapping()[self.__llm_dtype__] - self.__llm_torch_dtype__ = torch_dtype + if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'") + torch_dtype=_map[self.__llm_dtype__] + self.__llm_torch_dtype__=torch_dtype return self.__llm_torch_dtype__ - - @apply(lambda val: tuple(str.lower(i) if i else i for i in val)) - def _make_tag_components(self, model_id, model_version, backend) -> tuple[str, str | None]: - model_id, *maybe_revision = model_id.rsplit(':') - if len(maybe_revision) > 0: - if model_version is not None: - logger.warning( - "revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.", - maybe_revision[0], - model_version, - ) - model_version = maybe_revision[0] - if validate_is_path(model_id): - model_id, model_version = ( - resolve_filepath(model_id), - first_not_none(model_version, default=generate_hash_from_file(model_id)), - ) - return f'{backend}-{normalise_model_name(model_id)}', model_version - - def __setattr__(self, 
attr, value): - if attr in _reserved_namespace: - raise ForbiddenAttributeError(f'{attr} should not be set during runtime.') - super().__setattr__(attr, value) - - # fmt: off @property def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs} @_model_attrs.setter def _model_attrs(self, value):self.__model_attrs = value @property def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs} + def _cascade_backend(self)->LiteralBackend: + if self._has_gpus: + if is_vllm_available():return 'vllm' + elif is_ctranslate_available():return 'ctranslate' # XXX: base OpenLLM image should always include vLLM + elif is_ctranslate_available():return 'ctranslate' + else:return 'pt' + def __setattr__(self,attr,value): + if attr in _reserved_namespace:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.') + super().__setattr__(attr, value) + def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__ @property def __repr_keys__(self):return {'model_id','revision','backend','type'} def __repr_args__(self): @@ -282,10 +281,10 @@ class LLM(t.Generic[M, T], ReprMixin): yield 'backend',self.__llm_backend__ yield 'type',self.llm_type @property - def import_kwargs(self):import torch;return {'device_map':'auto' if torch.cuda.is_available() else None, 'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'} # noqa: I001 + def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'} @property def trust_remote_code(self): - env = os.getenv('TRUST_REMOTE_CODE') + env=os.getenv('TRUST_REMOTE_CODE') if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES return self.__llm_trust_remote_code__ @property @@ -319,10 +318,6 @@ class LLM(t.Generic[M, T], ReprMixin): @property def identifying_params(self):return 
{'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id} @property - def config(self): - if self.__llm_config__ is None:self.__llm_config__=openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs) - return self.__llm_config__ - @property def tokenizer(self): if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1]) return self.__llm_tokenizer__ @@ -330,8 +325,42 @@ class LLM(t.Generic[M, T], ReprMixin): def runner(self): if self.__llm_runner__ is None:self.__llm_runner__=_RunnerFactory(self) return self.__llm_runner__ + def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs): + if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.') + from peft.mapping import get_peft_model + from peft.utils.other import prepare_model_for_kbit_training + model=get_peft_model( + prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking), + self.config['fine_tune_strategies'] + .get(adapter_type,self.config.make_fine_tune_config(adapter_type)) + .train() + .with_config(**attrs) + .build(), + ) + if DEBUG:model.print_trainable_parameters() + return model,self.tokenizer + def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs) # fmt: on + @property + def adapter_map(self): + if not is_peft_available(): + raise MissingDependencyError("Failed to import 'peft'. 
Make sure to do 'pip install \"openllm[fine-tune]\"'") + if not self.has_adapters: + raise AttributeError('Adapter map is not available.') + assert self._adapter_map is not None + if self.__llm_adapter_map__ is None: + _map: ResolvedAdapterMap = {k: {} for k in self._adapter_map} + for adapter_type, adapter_tuple in self._adapter_map.items(): + base = first_not_none( + self.config['fine_tune_strategies'].get(adapter_type), + default=self.config.make_fine_tune_config(adapter_type), + ) + for adapter in adapter_tuple: + _map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id) + self.__llm_adapter_map__ = _map + return self.__llm_adapter_map__ + @property def model(self): if self.__llm_model__ is None: @@ -359,41 +388,31 @@ class LLM(t.Generic[M, T], ReprMixin): return self.__llm_model__ @property - def adapter_map(self): - if importlib.util.find_spec('peft') is None: - raise MissingDependencyError("Failed to import 'peft'. Make sure to do 'pip install \"openllm[fine-tune]\"'") - if not self.has_adapters: - raise AttributeError('Adapter map is not available.') - assert self._adapter_map is not None - if self.__llm_adapter_map__ is None: - _map: ResolvedAdapterMap = {k: {} for k in self._adapter_map} - for adapter_type, adapter_tuple in self._adapter_map.items(): - base = first_not_none( - self.config['fine_tune_strategies'].get(adapter_type), - default=self.config.make_fine_tune_config(adapter_type), - ) - for adapter in adapter_tuple: - _map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id) - self.__llm_adapter_map__ = _map - return self.__llm_adapter_map__ + def config(self): + import transformers - def prepare_for_training(self, adapter_type='lora', use_gradient_checking=True, **attrs): - from peft.mapping import get_peft_model - from peft.utils.other import prepare_model_for_kbit_training - - peft_config = ( - self.config['fine_tune_strategies'] - .get(adapter_type, 
self.config.make_fine_tune_config(adapter_type)) - .train() - .with_config(**attrs) - .build() - ) - model = get_peft_model( - prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checking), peft_config - ) - if DEBUG: - model.print_trainable_parameters() - return model, self.tokenizer + if self.__llm_config__ is None: + if self.__llm_backend__ == 'ctranslate': + try: + config = transformers.AutoConfig.from_pretrained( + self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code + ) + except OpenLLMException: + config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code) + for architecture in config.architectures: + if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE(): + config = openllm.AutoConfig.infer_class_from_name( + openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture] + ).model_construct_env(**self._model_attrs) + break + else: + raise OpenLLMException( + f"Failed to infer the configuration class from the given model. Make sure the model is a supported model. 
Supported models are: {', '.join(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE.keys())}" + ) + else: + config = openllm.AutoConfig.infer_class_from_llm(self).model_construct_env(**self._model_attrs) + self.__llm_config__ = config + return self.__llm_config__ async def generate( self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs @@ -476,7 +495,7 @@ def _RunnerFactory( scheduling_strategy = CascadingResourceStrategy - backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND', default=llm.__llm_backend__)) + backend = first_not_none(getenv('backend', default=backend), default=llm.__llm_backend__) models = models if models is not None else [] try: @@ -533,7 +552,7 @@ def _RunnerFactory( } ), )( - runnable(backend), + runnable(llm, backend), name=llm.runner_name, embedded=False, models=models, diff --git a/openllm-python/src/openllm/_llm.pyi b/openllm-python/src/openllm/_llm.pyi index 8a96e066..431410fe 100644 --- a/openllm-python/src/openllm/_llm.pyi +++ b/openllm-python/src/openllm/_llm.pyi @@ -135,7 +135,7 @@ class LLM(Generic[M, T]): def runner(self) -> Runner[M, T]: ... @property def adapter_map(self) -> ResolvedAdapterMap: ... - def prepare_for_training( + def prepare( self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any ) -> Tuple[InjectedModel, T]: ... 
async def generate( diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py index b3c2982c..c428919f 100644 --- a/openllm-python/src/openllm/_runners.py +++ b/openllm-python/src/openllm/_runners.py @@ -1,6 +1,5 @@ from __future__ import annotations import gc -import os import traceback import typing as t @@ -10,14 +9,87 @@ import bentoml import openllm from openllm_core._schemas import CompletionChunk, GenerationOutput, SampleLogprobs from openllm_core.exceptions import OpenLLMException -from openllm_core.utils import first_not_none, is_vllm_available +from openllm_core.utils import first_not_none, getenv, is_ctranslate_available __all__ = ['runnable'] -def runnable(backend=None): - backend = first_not_none(backend, os.getenv('OPENLLM_BACKEND'), default='vllm' if is_vllm_available() else 'pt') - return vLLMRunnable if backend == 'vllm' else PyTorchRunnable +def runnable(llm, backend=None): + backend = first_not_none(getenv('backend', default=backend), default=llm._cascade_backend()) + if backend == 'vllm': + return vLLMRunnable + elif backend == 'pt': + return PyTorchRunnable + elif backend == 'ctranslate': + return CTranslateRunnable + else: + raise OpenLLMException(f'Unsupported backend: {backend}') + + +class CTranslateRunnable(bentoml.Runnable): + SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'cpu') + SUPPORTS_CPU_MULTI_THREADING = True + + def __init__(self, llm): + if not is_ctranslate_available(): + raise OpenLLMException('ctranslate is not installed. 
Please install it with `pip install "openllm[ctranslate]"`') + self.config = llm.config + self.model = llm.model + self.tokenizer = llm.tokenizer + + @bentoml.Runnable.method(batchable=False) + async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs): + if adapter_name is not None: + raise NotImplementedError('Adapter is not supported with CTranslate.') + + stop_ = set() + if isinstance(stop, str) and stop != '': + stop_.add(stop) + elif isinstance(stop, t.Iterable): + stop_.update(stop) + + config = self.config.model_construct_env(stop=list(stop_), **attrs) + sampling_params = dict( + max_length=config['max_new_tokens'], + min_length=config['min_length'], + sampling_topk=config['top_k'], + sampling_topp=config['top_p'], + sampling_temperature=config['temperature'], + return_log_prob=config['logprobs'] > 0, + repetition_penalty=config['repetition_penalty'], + no_repeat_ngram_size=config['no_repeat_ngram_size'], + end_token=config['stop'], + ) + cumulative_logprob = 0.0 + output_token_ids = list(prompt_token_ids) + input_len = len(prompt_token_ids) + async for request_output in self.model.async_generate_tokens( + self.tokenizer.convert_ids_to_tokens(prompt_token_ids), **sampling_params + ): + cumulative_logprob += request_output.log_prob if config['logprobs'] else 0.0 + output_token_ids.append(request_output.token_id) + text = self.tokenizer.decode( + output_token_ids[input_len:], + skip_special_tokens=True, + spaces_between_special_tokens=False, + clean_up_tokenization_spaces=True, + ) + yield GenerationOutput( + prompt='', + finished=request_output.is_last, + outputs=[ + CompletionChunk( + index=0, + text=text, + token_ids=output_token_ids[input_len:], + cumulative_logprob=cumulative_logprob, + finish_reason=None, + # TODO: logprobs, but seems like we don't have access to the raw logits + ) + ], + prompt_token_ids=prompt_token_ids, + request_id=request_id, + ).model_dump_json() class vLLMRunnable(bentoml.Runnable): @@ 
-44,7 +116,7 @@ class vLLMRunnable(bentoml.Runnable): trust_remote_code=llm.trust_remote_code, tokenizer_mode='auto', tensor_parallel_size=num_gpus, - dtype=str(llm._torch_dtype).split('.')[-1], + dtype=llm._torch_dtype, quantization=quantization, worker_use_ray=False, engine_use_ray=False, @@ -242,7 +314,7 @@ class PyTorchRunnable(bentoml.Runnable): CompletionChunk( index=0, text=text, - token_ids=output_token_ids[input_len:], + token_ids=tmp_output_ids, cumulative_logprob=cumulative_logprob, logprobs=sample_logprobs if config['logprobs'] else None, finish_reason=None, diff --git a/openllm-python/src/openllm/_runners.pyi b/openllm-python/src/openllm/_runners.pyi index ece66350..ef685af5 100644 --- a/openllm-python/src/openllm/_runners.pyi +++ b/openllm-python/src/openllm/_runners.pyi @@ -18,7 +18,7 @@ from typing import ( from bentoml import Model, Strategy, Tag from bentoml._internal.runner.runner_handle import RunnerHandle from openllm_core import LLMConfig -from openllm_core._typing_compat import LiteralBackend, T, overload +from openllm_core._typing_compat import LiteralBackend, M, T, overload from ._llm import LLM @@ -32,10 +32,16 @@ try: except ImportError: PreTrainedModel = Any +try: + from ctranslate2 import Generator, Translator +except ImportError: + Translator = Any + Generator = Any + Mo = TypeVar('Mo') class _Runnable(Protocol[Mo]): - SUPPORTED_RESOURCES: Tuple[Literal['nvidia.com/gpu'], Literal['amd.com/gpu'], Literal['cpu']] = ... + SUPPORTED_RESOURCES: Tuple[Literal['nvidia.com/gpu', 'amd.com/gpu', 'cpu'], ...] = ... SUPPORTS_CPU_MULTI_THREADING: bool = ... config: LLMConfig = ... model: Mo = ... @@ -57,6 +63,10 @@ class RunnerMethod(Generic[In, Ret]): ... @final class vLLMRunnable(_Runnable[AsyncLLMEngine]): ... 
+@final +class CTranslateRunnable(_Runnable[Union[Translator, Generator]]): + tokenizer: Any + @final class PyTorchRunnable(_Runnable[PreTrainedModel]): tokenizer: Any @@ -70,11 +80,15 @@ class PyTorchRunnable(_Runnable[PreTrainedModel]): ) -> AsyncGenerator[str, None]: ... @overload -def runnable(backend: Literal['vllm']) -> Type[vLLMRunnable]: ... +def runnable(llm: LLM[M, T], backend: Literal['vllm']) -> Type[vLLMRunnable]: ... @overload -def runnable(backend: Literal['pt']) -> Type[PyTorchRunnable]: ... +def runnable(llm: LLM[M, T], backend: Literal['pt']) -> Type[PyTorchRunnable]: ... @overload -def runnable(backend: Optional[str] = ...) -> Type[Union[vLLMRunnable, PyTorchRunnable]]: ... +def runnable(llm: LLM[M, T], backend: Literal['ctranslate']) -> Type[CTranslateRunnable]: ... +@overload +def runnable( + llm: LLM[M, T], backend: Optional[str] = ... +) -> Type[Union[vLLMRunnable, PyTorchRunnable, CTranslateRunnable]]: ... class Runner(Protocol[Mo, T]): __doc__: str = ... diff --git a/openllm-python/src/openllm/serialisation/_helpers.py b/openllm-python/src/openllm/serialisation/_helpers.py new file mode 100644 index 00000000..23643bae --- /dev/null +++ b/openllm-python/src/openllm/serialisation/_helpers.py @@ -0,0 +1,159 @@ +import contextlib + +import attr +from simple_di import Provide, inject + +import bentoml +import openllm +from bentoml._internal.configuration.containers import BentoMLContainer +from bentoml._internal.models.model import ModelOptions, ModelSignature +from openllm_core.exceptions import OpenLLMException +from openllm_core.utils import is_autogptq_available + +_object_setattr = object.__setattr__ + + +def get_hash(config) -> str: + _commit_hash = getattr(config, '_commit_hash', None) + if _commit_hash is None: + raise ValueError(f'Cannot find commit hash in {config}') + return _commit_hash + + +def patch_correct_tag(llm, config, _revision=None): + # NOTE: The following won't hit during local since we generated a correct version based on 
local path hash It will only hit if we use model from HF Hub + if llm.revision is not None: + return + if not llm.local: + try: + if _revision is None: + _revision = get_hash(config) + except ValueError: + pass + if _revision is None and llm.tag.version is not None: + _revision = llm.tag.version + if llm.tag.version is None: + # HACK: This copies the correct revision into llm.tag + _object_setattr(llm, '_tag', attr.evolve(llm.tag, version=_revision)) + if llm._revision is None: + _object_setattr(llm, '_revision', _revision) # HACK: This copies the correct revision into llm._model_version + + +def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadata=None): + if metadata is None: + metadata = {} + metadata.update({'safe_serialisation': safe_serialisation, '_framework': llm.__llm_backend__}) + if llm.quantise: + metadata['_quantize'] = llm.quantise + architectures = getattr(config, 'architectures', []) + if not architectures: + if trust_remote_code: + auto_map = getattr(config, 'auto_map', {}) + if not auto_map: + raise RuntimeError( + f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}' + ) + autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM' + if autoclass not in auto_map: + raise RuntimeError( + f"Given model '{llm.model_id}' is yet to be supported with 'auto_map'. OpenLLM currently only support encoder-decoders or decoders only models." + ) + architectures = [auto_map[autoclass]] + else: + raise RuntimeError( + 'Failed to determine the architecture for this model. 
Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`' + ) + metadata.update( + {'_pretrained_class': architectures[0], '_revision': get_hash(config) if not llm.local else llm.revision} + ) + return metadata + + +def _create_signatures(llm, signatures=None): + if signatures is None: + signatures = {} + if llm.__llm_backend__ == 'pt': + if llm.quantise == 'gptq': + if not is_autogptq_available(): + raise OpenLLMException( + "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'" + ) + signatures['generate'] = {'batchable': False} + else: + signatures.update( + { + k: ModelSignature(batchable=False) + for k in ( + '__call__', + 'forward', + 'generate', + 'contrastive_search', + 'greedy_search', + 'sample', + 'beam_search', + 'beam_sample', + 'group_beam_search', + 'constrained_beam_search', + ) + } + ) + elif llm.__llm_backend__ == 'ctranslate': + if llm.config['model_type'] == 'seq2seq_lm': + non_batch_keys = {'score_file', 'translate_file'} + batch_keys = {'generate_tokens', 'score_batch', 'translate_batch', 'translate_iterable', 'score_iterable'} + else: + non_batch_keys = set() + batch_keys = { + 'async_generate_tokens', + 'forward_batch', + 'generate_batch', + 'generate_iterable', + 'generate_tokens', + 'score_batch', + 'score_iterable', + } + signatures.update({k: ModelSignature(batchable=False) for k in non_batch_keys}) + signatures.update({k: ModelSignature(batchable=True) for k in batch_keys}) + return signatures + + +@inject +@contextlib.contextmanager +def save_model( + llm, + config, + safe_serialisation, + trust_remote_code, + module, + external_modules, + _model_store=Provide[BentoMLContainer.model_store], + _api_version='v2.1.0', +): + imported_modules = [] + bentomodel = bentoml.Model.create( + llm.tag, + module=f'openllm.serialisation.{module}', + 
api_version=_api_version, + options=ModelOptions(), + context=openllm.utils.generate_context('openllm'), + labels=openllm.utils.generate_labels(llm), + metadata=_create_metadata(llm, config, safe_serialisation, trust_remote_code), + signatures=_create_signatures(llm), + ) + with openllm.utils.analytics.set_bentoml_tracking(): + try: + bentomodel.enter_cloudpickle_context(external_modules, imported_modules) + yield bentomodel, imported_modules + except Exception: + raise + else: + bentomodel.flush() + bentomodel.save(_model_store) + openllm.utils.analytics.track( + openllm.utils.analytics.ModelSaveEvent( + module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024 + ) + ) + finally: + bentomodel.exit_cloudpickle_context(imported_modules) + return bentomodel diff --git a/openllm-python/src/openllm/serialisation/_helpers.pyi b/openllm-python/src/openllm/serialisation/_helpers.pyi new file mode 100644 index 00000000..a4c5865c --- /dev/null +++ b/openllm-python/src/openllm/serialisation/_helpers.pyi @@ -0,0 +1,24 @@ +import types +from contextlib import contextmanager +from typing import Optional, Sequence + +import transformers + +from bentoml import Model +from openllm_core._typing_compat import M, T + +from .._llm import LLM + +def get_hash(config: transformers.PretrainedConfig) -> str: ... +def patch_correct_tag( + llm: LLM[M, T], config: transformers.PretrainedConfig, _revision: Optional[str] = ... +) -> None: ... +@contextmanager +def save_model( + llm: LLM[M, T], + config: transformers.PretrainedConfig, + safe_serialisation: bool, + trust_remote_code: bool, + module: str, + external_modules: Sequence[types.ModuleType], +) -> Model: ... 
diff --git a/openllm-python/src/openllm/serialisation/ctranslate/__init__.py b/openllm-python/src/openllm/serialisation/ctranslate/__init__.py index e69de29b..471e4418 100644 --- a/openllm-python/src/openllm/serialisation/ctranslate/__init__.py +++ b/openllm-python/src/openllm/serialisation/ctranslate/__init__.py @@ -0,0 +1,100 @@ +import importlib +import logging +import shutil + +import transformers + +import bentoml +from openllm_core.exceptions import OpenLLMException +from openllm_core.utils import is_ctranslate_available + +from .._helpers import patch_correct_tag, save_model +from ..transformers._helpers import get_tokenizer, process_config + +if not is_ctranslate_available(): + raise RuntimeError( + "'ctranslate2' is required to use with backend 'ctranslate'. Install it with 'pip install \"openllm[ctranslate]\"'" + ) + +import ctranslate2 +from ctranslate2.converters.transformers import TransformersConverter + +logger = logging.getLogger(__name__) + + +def _get_class(llm): + return ctranslate2.Translator if llm.config['model_type'] == 'seq2seq_lm' else ctranslate2.Generator + + +def import_model(llm, *decls, trust_remote_code, **attrs): + (_base_decls, _base_attrs), tokenizer_attrs = llm.llm_parameters + for it in {'device_map', 'torch_dtype'}: + _base_attrs.pop(it, None) # pop out hf-specific attributes + decls = (*_base_decls, *decls) + attrs = {**_base_attrs, **attrs} + low_cpu_mem_usage = attrs.pop('low_cpu_mem_usage', True) + logger.debug( + 'Note that CTranslate2 will load into memory for conversion. Refer to https://opennmt.net/CTranslate2/guides/transformers.html for more information.' + ) + if not llm._local: + logger.warning( + "It is RECOMMENDED to convert '%s' to CTranslate2 format yourself to utilise CTranslate2's features, then start with `openllm start /path/to/ct2-dir`. 
OpenLLM will conservely apply quantization for conversion if specified.", + llm.model_id, + ) + config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs) + patch_correct_tag(llm, config) + tokenizer = get_tokenizer(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs) + with save_model( + llm, config, False, trust_remote_code, 'ctranslate', [importlib.import_module(tokenizer.__module__)] + ) as save_metadata: + bentomodel, _ = save_metadata + if llm._local: + shutil.copytree( + llm.model_id, + bentomodel.path, + symlinks=False, + ignore=shutil.ignore_patterns('.git', 'venv', '__pycache__', '.venv'), + dirs_exist_ok=True, + ) + else: + TransformersConverter( + llm.model_id, + load_as_float16=llm.quantise in ('float16', 'int8_float16'), + low_cpu_mem_usage=low_cpu_mem_usage, + trust_remote_code=trust_remote_code, + ).convert(bentomodel.path, quantization=llm.quantise, force=True) + # Save the original HF configuration to hf + config.save_pretrained(bentomodel.path_of('/hf/')) + tokenizer.save_pretrained(bentomodel.path) + return bentomodel + + +def get(llm): + try: + model = bentoml.models.get(llm.tag) + backend = model.info.labels['backend'] + if backend != llm.__llm_backend__: + raise OpenLLMException( + f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'." 
+ ) + patch_correct_tag( + llm, + transformers.AutoConfig.from_pretrained(model.path_of('/hf/'), trust_remote_code=llm.trust_remote_code), + _revision=model.info.metadata.get('_revision'), + ) + return model + except Exception as err: + raise OpenLLMException(f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err + + +def load_model(llm, *decls, **attrs): + device = 'cuda' if llm._has_gpus else 'cpu' + if llm.quantise: + compute_type = llm.quantise + elif llm.__llm_dtype__ == 'half': + compute_type = 'float16' + elif llm.__llm_dtype__ == 'float': + compute_type = 'float32' + else: + compute_type = llm.__llm_dtype__ + return _get_class(llm)(llm.bentomodel.path, device=device, compute_type=compute_type) diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index 09162049..b791f31a 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -1,22 +1,18 @@ -from __future__ import annotations import importlib import logging -import attr import orjson import torch import transformers from huggingface_hub import snapshot_download -from simple_di import Provide, inject import bentoml -import openllm -from bentoml._internal.configuration.containers import BentoMLContainer -from bentoml._internal.models.model import ModelOptions, ModelSignature from openllm_core.exceptions import OpenLLMException +from openllm_core.utils import first_not_none, is_autogptq_available -from ._helpers import get_hash, infer_autoclass_from_llm, process_config +from ._helpers import get_tokenizer, infer_autoclass_from_llm, process_config from .weights import HfIgnore +from .._helpers import patch_correct_tag, save_model logger = logging.getLogger(__name__) @@ -24,162 +20,56 @@ __all__ = ['import_model', 'get', 'load_model'] _object_setattr = object.__setattr__ -def _patch_correct_tag(llm, 
config, _revision=None): - # NOTE: The following won't hit during local since we generated a correct version based on local path hash It will only hit if we use model from HF Hub - if llm.revision is not None: - return - if not llm.local: - try: - if _revision is None: - _revision = get_hash(config) - except ValueError: - pass - if _revision is None and llm.tag.version is not None: - _revision = llm.tag.version - if llm.tag.version is None: - _object_setattr( - llm, '_tag', attr.evolve(llm.tag, version=_revision) - ) # HACK: This copies the correct revision into llm.tag - if llm._revision is None: - _object_setattr(llm, '_revision', _revision) # HACK: This copies the correct revision into llm._model_version - - -@inject -def import_model(llm, *decls, trust_remote_code, _model_store=Provide[BentoMLContainer.model_store], **attrs): - _base_decls, _base_attrs = llm.llm_parameters[0] +def import_model(llm, *decls, trust_remote_code, **attrs): + (_base_decls, _base_attrs), tokenizer_attrs = llm.llm_parameters decls = (*_base_decls, *decls) attrs = {**_base_attrs, **attrs} if llm._local: logger.warning('Given model is a local model, OpenLLM will load model into memory for serialisation.') config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs) - _patch_correct_tag(llm, config) - _, tokenizer_attrs = llm.llm_parameters - quantize = llm.quantise - safe_serialisation = openllm.utils.first_not_none( - attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors' - ) - metadata = {'safe_serialisation': safe_serialisation} - if quantize: - metadata['_quantize'] = quantize - architectures = getattr(config, 'architectures', []) - if not architectures: - if trust_remote_code: - auto_map = getattr(config, 'auto_map', {}) - if not auto_map: - raise RuntimeError( - f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}' - ) - autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 
'seq2seq_lm' else 'AutoModelForCausalLM' - if autoclass not in auto_map: - raise RuntimeError( - f"Given model '{llm.model_id}' is yet to be supported with 'auto_map'. OpenLLM currently only support encoder-decoders or decoders only models." - ) - architectures = [auto_map[autoclass]] - else: - raise RuntimeError( - 'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`' - ) - metadata['_pretrained_class'] = architectures[0] - if not llm._local: - metadata['_revision'] = get_hash(config) - else: - metadata['_revision'] = llm.revision - - signatures = {} - - if quantize == 'gptq': - if not openllm.utils.is_autogptq_available(): - raise OpenLLMException( - "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'" - ) - signatures['generate'] = {'batchable': False} - else: + patch_correct_tag(llm, config) + safe_serialisation = first_not_none(attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors') + if llm.quantise != 'gptq': attrs['use_safetensors'] = safe_serialisation - metadata['_framework'] = llm.__llm_backend__ - signatures.update( - { - k: ModelSignature(batchable=False) - for k in ( - '__call__', - 'forward', - 'generate', - 'contrastive_search', - 'greedy_search', - 'sample', - 'beam_search', - 'beam_sample', - 'group_beam_search', - 'constrained_beam_search', - ) - } - ) - - tokenizer = transformers.AutoTokenizer.from_pretrained( - llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs - ) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token model = None - external_modules = [importlib.import_module(tokenizer.__module__)] - imported_modules = [] - bentomodel = bentoml.Model.create( - llm.tag, - module='openllm.serialisation.transformers', - 
api_version='v2.1.0', - options=ModelOptions(), - context=openllm.utils.generate_context(framework_name='openllm'), - labels=openllm.utils.generate_labels(llm), - metadata=metadata, - signatures=signatures, - ) - with openllm.utils.analytics.set_bentoml_tracking(): - try: - bentomodel.enter_cloudpickle_context(external_modules, imported_modules) - tokenizer.save_pretrained(bentomodel.path) - if llm._quantization_config or (llm.quantise and llm.quantise not in {'squeezellm', 'awq'}): - attrs['quantization_config'] = llm.quantization_config - if quantize == 'gptq': - from optimum.gptq.constants import GPTQ_CONFIG + tokenizer = get_tokenizer(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs) + with save_model( + llm, config, safe_serialisation, trust_remote_code, 'transformers', [importlib.import_module(tokenizer.__module__)] + ) as save_metadata: + bentomodel, imported_modules = save_metadata + tokenizer.save_pretrained(bentomodel.path) + if llm._quantization_config or (llm.quantise and llm.quantise not in {'squeezellm', 'awq'}): + attrs['quantization_config'] = llm.quantization_config + if llm.quantise == 'gptq': + from optimum.gptq.constants import GPTQ_CONFIG - with open(bentomodel.path_of(GPTQ_CONFIG), 'w', encoding='utf-8') as f: - f.write(orjson.dumps(config.quantization_config, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode()) - if llm._local: # possible local path - model = infer_autoclass_from_llm(llm, config).from_pretrained( - llm.model_id, - *decls, - local_files_only=True, - config=config, - trust_remote_code=trust_remote_code, - **hub_attrs, - **attrs, - ) - # for trust_remote_code to work - bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules) - model.save_pretrained(bentomodel.path, max_shard_size='2GB', safe_serialization=safe_serialisation) - del model - if torch.cuda.is_available(): - torch.cuda.empty_cache() - else: - # we will clone the all tings into the 
bentomodel path without loading model into memory - snapshot_download( - llm.model_id, - local_dir=bentomodel.path, - local_dir_use_symlinks=False, - ignore_patterns=HfIgnore.ignore_patterns(llm), - ) - except Exception: - raise - else: - bentomodel.flush() # type: ignore[no-untyped-call] - bentomodel.save(_model_store) - openllm.utils.analytics.track( - openllm.utils.analytics.ModelSaveEvent( - module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024 - ) + with open(bentomodel.path_of(GPTQ_CONFIG), 'w', encoding='utf-8') as f: + f.write(orjson.dumps(config.quantization_config, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode()) + if llm._local: # possible local path + model = infer_autoclass_from_llm(llm, config).from_pretrained( + llm.model_id, + *decls, + local_files_only=True, + config=config, + trust_remote_code=trust_remote_code, + **hub_attrs, + **attrs, + ) + # for trust_remote_code to work + bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules) + model.save_pretrained(bentomodel.path, max_shard_size='2GB', safe_serialization=safe_serialisation) + del model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + else: + # we will clone the all tings into the bentomodel path without loading model into memory + snapshot_download( + llm.model_id, + local_dir=bentomodel.path, + local_dir_use_symlinks=False, + ignore_patterns=HfIgnore.ignore_patterns(llm), ) - finally: - bentomodel.exit_cloudpickle_context(imported_modules) return bentomodel @@ -191,7 +81,7 @@ def get(llm): raise OpenLLMException( f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'." 
) - _patch_correct_tag( + patch_correct_tag( llm, transformers.AutoConfig.from_pretrained(model.path, trust_remote_code=llm.trust_remote_code), _revision=model.info.metadata.get('_revision'), @@ -226,7 +116,7 @@ def load_model(llm, *decls, **attrs): if '_quantize' in llm.bentomodel.info.metadata: _quantise = llm.bentomodel.info.metadata['_quantize'] if _quantise == 'gptq': - if not openllm.utils.is_autogptq_available(): + if not is_autogptq_available(): raise OpenLLMException( "GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'" ) diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index f3ceb980..5664e1ea 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -11,11 +11,13 @@ from openllm_core.utils import get_disable_warnings, get_quiet_mode logger = logging.getLogger(__name__) -def get_hash(config: transformers.PretrainedConfig) -> str: - _commit_hash = getattr(config, '_commit_hash', None) - if _commit_hash is None: - raise ValueError(f'Cannot find commit hash in {config}') - return _commit_hash +def get_tokenizer(model_id_or_path, trust_remote_code, **attrs): + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_id_or_path, trust_remote_code=trust_remote_code, **attrs + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + return tokenizer def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any): diff --git a/openllm-python/src/openllm/utils/__init__.pyi b/openllm-python/src/openllm/utils/__init__.pyi index 4ec387f7..ae535687 100644 --- a/openllm-python/src/openllm/utils/__init__.pyi +++ b/openllm-python/src/openllm/utils/__init__.pyi @@ -34,6 +34,7 @@ from 
openllm_core.utils import ( is_autogptq_available as is_autogptq_available, is_bentoml_available as is_bentoml_available, is_bitsandbytes_available as is_bitsandbytes_available, + is_ctranslate_available as is_ctranslate_available, is_grpc_available as is_grpc_available, is_jupyter_available as is_jupyter_available, is_jupytext_available as is_jupytext_available, diff --git a/openllm-python/src/openllm_cli/_factory.py b/openllm-python/src/openllm_cli/_factory.py index 374b664c..b7bd68ed 100644 --- a/openllm-python/src/openllm_cli/_factory.py +++ b/openllm-python/src/openllm_cli/_factory.py @@ -17,7 +17,6 @@ from openllm_core._typing_compat import ( Concatenate, DictStrAny, LiteralBackend, - LiteralQuantise, LiteralSerialisation, ParamSpec, get_literal_args, @@ -289,10 +288,10 @@ def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[ def dtype_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( '--dtype', - type=click.Choice(['float16', 'float32', 'bfloat16', 'auto']), + type=str, envvar='TORCH_DTYPE', default='auto', - help='Optional dtype for casting tensors for running inference.', + help="Optional dtype for casting tensors for running inference ['float16', 'float32', 'bfloat16', 'int8', 'int16']. 
For CTranslate2, it also accepts the following ['int8_float32', 'int8_float16', 'int8_bfloat16']", **attrs, )(f) @@ -341,15 +340,13 @@ def prompt_template_file_option(f: _AnyCallable | None = None, **attrs: t.Any) - def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - # NOTE: LiteralBackend needs to remove the last two item as ggml and mlc is wip - # XXX: remove the check for __args__ once we have ggml and mlc supports return cli_option( '--backend', - type=click.Choice(get_literal_args(LiteralBackend)[:2]), + type=click.Choice(get_literal_args(LiteralBackend)), default=None, envvar='OPENLLM_BACKEND', show_envvar=True, - help='The implementation for saving this LLM.', + help='Runtime to use for both serialisation/inference engine.', **attrs, )(f) @@ -368,7 +365,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att '--quantise', '--quantize', 'quantize', - type=click.Choice(get_literal_args(LiteralQuantise)), + type=str, default=None, envvar='OPENLLM_QUANTIZE', show_envvar=True, @@ -382,6 +379,10 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att - ``gptq``: ``GPTQ`` [quantization](https://arxiv.org/abs/2210.17323) + - ``awq``: ``AWQ`` [AWQ: Activation-aware Weight Quantization](https://arxiv.org/abs/2306.00978) + + - ``squeezellm``: ``SqueezeLLM`` [SqueezeLLM: Dense-and-Sparse Quantization](https://arxiv.org/abs/2306.07629) + > [!NOTE] that the model can also be served with quantized weights. 
""" + ( diff --git a/openllm-python/src/openllm_cli/_sdk.py b/openllm-python/src/openllm_cli/_sdk.py index 3b4fd631..555003cf 100644 --- a/openllm-python/src/openllm_cli/_sdk.py +++ b/openllm-python/src/openllm_cli/_sdk.py @@ -85,9 +85,7 @@ def _start( """ from .entrypoint import start_command, start_grpc_command - os.environ['OPENLLM_BACKEND'] = openllm_core.utils.first_not_none( - backend, default='vllm' if is_vllm_available() else 'pt' - ) + os.environ['BACKEND'] = openllm_core.utils.first_not_none(backend, default='vllm' if is_vllm_available() else 'pt') args: list[str] = [model_id] if system_message: diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py index ea388da1..3c99d544 100644 --- a/openllm-python/src/openllm_cli/entrypoint.py +++ b/openllm-python/src/openllm_cli/entrypoint.py @@ -450,7 +450,7 @@ def start_command( import torch - if not torch.cuda.is_available(): + if backend == 'pt' and not torch.cuda.is_available(): if dtype == 'auto': dtype = 'float' elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode(): @@ -465,7 +465,7 @@ def start_command( adapter_map=adapter_map, quantize=quantize, serialisation=serialisation, - torch_dtype=dtype, + dtype=dtype, ) backend_warning(llm.__llm_backend__) @@ -580,7 +580,7 @@ def start_grpc_command( import torch - if not torch.cuda.is_available(): + if backend == 'pt' and not torch.cuda.is_available(): if dtype == 'auto': dtype = 'float' elif dtype not in {'float', 'float32'} and not get_disable_warnings() and not get_quiet_mode(): @@ -595,7 +595,7 @@ def start_grpc_command( adapter_map=adapter_map, quantize=quantize, serialisation=serialisation, - torch_dtype=dtype, + dtype=dtype, trust_remote_code=check_bool_env('TRUST_REMOTE_CODE'), ) backend_warning(llm.__llm_backend__) @@ -661,14 +661,14 @@ def process_environ( 'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()), 'OPENLLM_ADAPTER_MAP': 
orjson.dumps(adapter_map).decode(), 'OPENLLM_SERIALIZATION': serialisation, - 'OPENLLM_BACKEND': llm.__llm_backend__, 'OPENLLM_CONFIG': config.model_dump_json(flatten=True).decode(), - 'TORCH_DTYPE': str(llm._torch_dtype).split('.')[-1], + 'BACKEND': llm.__llm_backend__, + 'DTYPE': str(llm._torch_dtype).split('.')[-1], 'TRUST_REMOTE_CODE': str(llm.trust_remote_code), } ) if llm.quantise: - environ['OPENLLM_QUANTIZE'] = str(llm.quantise) + environ['QUANTIZE'] = str(llm.quantise) if system_message: environ['OPENLLM_SYSTEM_MESSAGE'] = system_message if prompt_template: @@ -695,10 +695,11 @@ def process_workers_per_resource(wpr: str | float | int, device: tuple[str, ...] def build_bento_instruction(llm, model_id, serialisation, adapter_map): - cmd_name = f'openllm build {model_id}' + cmd_name = f'openllm build {model_id} --backend {llm.__llm_backend__}' if llm.quantise: cmd_name += f' --quantize {llm.quantise}' - cmd_name += f' --serialization {serialisation}' + if llm.__llm_backend__ in {'pt', 'vllm'}: + cmd_name += f' --serialization {serialisation}' if adapter_map is not None: cmd_name += ' ' + ' '.join( [ @@ -1042,7 +1043,7 @@ def build_command( system_message=system_message, backend=backend, quantize=quantize, - torch_dtype=dtype, + dtype=dtype, serialisation=first_not_none( serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy' ), diff --git a/openllm-python/src/openllm_cli/playground/falcon_tuned.py b/openllm-python/src/openllm_cli/playground/falcon_tuned.py index ff603267..83a6a7be 100644 --- a/openllm-python/src/openllm_cli/playground/falcon_tuned.py +++ b/openllm-python/src/openllm_cli/playground/falcon_tuned.py @@ -61,8 +61,8 @@ else: llm = openllm.LLM( model_args.model_id, quantize='int4', bnb_4bit_quant_type='nf4', bnb_4bit_compute_dtype=torch.float16 ) -model, tokenizer = llm.prepare_for_training( - adapter_type='lora', +model, tokenizer = llm.prepare( + 'lora', lora_alpha=16, lora_dropout=0.1, r=16, diff 
--git a/openllm-python/src/openllm_cli/playground/llama2_qlora.py b/openllm-python/src/openllm_cli/playground/llama2_qlora.py index 6c33e759..a1c0ae1f 100644 --- a/openllm-python/src/openllm_cli/playground/llama2_qlora.py +++ b/openllm-python/src/openllm_cli/playground/llama2_qlora.py @@ -135,9 +135,7 @@ def prepare_for_int4_training( modules = find_all_linear_names(llm.model) print(f'Found {len(modules)} modules to quantize: {modules}') - model, tokenizer = llm.prepare_for_training( - adapter_type='lora', use_gradient_checkpointing=gradient_checkpointing, target_modules=modules - ) + model, tokenizer = llm.prepare('lora', use_gradient_checkpointing=gradient_checkpointing, target_modules=modules) # pre-process the model by upcasting the layer norms in float 32 for for name, module in model.named_modules(): diff --git a/openllm-python/src/openllm_cli/playground/opt_tuned.py b/openllm-python/src/openllm_cli/playground/opt_tuned.py index dbeb8ffb..03b239dc 100644 --- a/openllm-python/src/openllm_cli/playground/opt_tuned.py +++ b/openllm-python/src/openllm_cli/playground/opt_tuned.py @@ -65,8 +65,8 @@ else: model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses()) llm = openllm.LLM(model_args.model_id, quantize='int8') -model, tokenizer = llm.prepare_for_training( - adapter_type='lora', r=16, lora_alpha=32, target_modules=['q_proj', 'v_proj'], lora_dropout=0.05, bias='none' +model, tokenizer = llm.prepare( + 'lora', r=16, lora_alpha=32, target_modules=['q_proj', 'v_proj'], lora_dropout=0.05, bias='none' ) # ft on english_quotes diff --git a/tools/update-readme.py b/tools/update-readme.py index e8cb8ae6..c3ed2423 100755 --- a/tools/update-readme.py +++ b/tools/update-readme.py @@ -94,6 +94,25 @@ openllm query 'What are large language models?' 
```""", ] ) + if 'ctranslate' in it['backend']: + details_block.extend( + [ + '\n- CTranslate2 (*experimental*):\n\n', + f"""\ +```bash +{'' if not it['trust_remote_code'] else 'TRUST_REMOTE_CODE=True '}openllm start {it['model_ids'][0]} --backend ctranslate +```""", + *markdown_noteblock( + 'Currently, All quantization methods from ctranslate2 are supported. This includes int8, int8_float16, int8_bfloat16' + ), + *markdown_noteblock( + 'We recommend users to convert the models beforehand, and then provide the given directory of the converted models to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.' + ), + *markdown_importantblock( + 'CTranslate2 is an experimental backend and yet to be fully supported. It is recommended to use vLLM for all production use-case.' + ), + ] + ) details_block.append('\n
\n\n') content.append('\n'.join(details_block))