diff --git a/src/openllm/__init__.py b/src/openllm/__init__.py index cc7b1638..33ae1c07 100644 --- a/src/openllm/__init__.py +++ b/src/openllm/__init__.py @@ -34,7 +34,7 @@ _import_structure = { "cli": [], "configuration_utils": ["LLMConfig"], "exceptions": [], - "runner_utils": ["LLMRunner", "LLMRunnable", "generate_tokenizer_runner"], + "runner_utils": ["LLMRunner", "LLMRunnable"], "schema": ["PromptTemplate"], "server_utils": ["start", "start_grpc"], "types": [], @@ -42,13 +42,7 @@ _import_structure = { "models": [], "client": [], # NOTE: models - "models.auto": [ - "AutoConfig", - "CONFIG_MAPPING", - "AutoTokenizer", - "TOKENIZER_MAPPING", - "TOKENIZER_MAPPING_NAMES", - ], + "models.auto": ["AutoConfig", "CONFIG_MAPPING"], "models.flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING"], } @@ -58,17 +52,8 @@ try: except MissingDependencyError: pass else: - _import_structure["models.flan_t5"].extend(["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"]) - _import_structure["models.auto"].extend( - [ - "AutoLLM", - "AutoLLMWithTokenizer", - "MODEL_MAPPING_NAMES", - "MODEL_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_MAPPING", - "MODEL_WITH_TOKENIZER_MAPPING", - ] - ) + _import_structure["models.flan_t5"].extend(["FlanT5"]) + _import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"]) try: if not imports.is_flax_available(): @@ -76,17 +61,8 @@ try: except MissingDependencyError: pass else: - _import_structure["models.flan_t5"].extend(["FlaxFlanT5", "FlaxFlanT5WithTokenizer"]) - _import_structure["models.auto"].extend( - [ - "AutoFlaxLLM", - "AutoFlaxLLMWithTokenizer", - "MODEL_FLAX_MAPPING_NAMES", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_FLAX_MAPPING", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING", - ] - ) + _import_structure["models.flan_t5"].extend(["FlaxFlanT5"]) + _import_structure["models.auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"]) try: if not imports.is_tf_available(): @@ -94,17 +70,8 @@ try: except MissingDependencyError: pass else: - _import_structure["models.flan_t5"].extend(["TFFlanT5", "TFFlanT5WithTokenizer"]) - _import_structure["models.auto"].extend( - [ - "AutoTFLLM", - "AutoTFLLMWithTokenizer", - "MODEL_TF_MAPPING_NAMES", - "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_TF_MAPPING", - "MODEL_TF_WITH_TOKENIZER_MAPPING", - ] - ) + _import_structure["models.flan_t5"].extend(["TFFlanT5"]) + _import_structure["models.auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"]) # declaration for OpenLLM-related modules @@ -123,17 +90,12 @@ if t.TYPE_CHECKING: # Specific types import from .configuration_utils import LLMConfig as LLMConfig from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING - from .models.auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING - from .models.auto import TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES from .models.auto import AutoConfig as AutoConfig - from .models.auto import AutoTokenizer as AutoTokenizer from .models.flan_t5 import \ START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING from .models.flan_t5 import FlanT5Config as FlanT5Config from .runner_utils import LLMRunnable as LLMRunnable from .runner_utils import LLMRunner as LLMRunner - from .runner_utils import \ - generate_tokenizer_runner as generate_tokenizer_runner from .schema import PromptTemplate as PromptTemplate from .server_utils import start as start from .server_utils import start_grpc as start_grpc @@ -146,16 +108,8 @@ if t.TYPE_CHECKING: else: from .models.auto 
import MODEL_MAPPING as MODEL_MAPPING from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES - from .models.auto import \ - MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING - from .models.auto import \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES from .models.auto import AutoLLM as AutoLLM - from .models.auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer from .models.flan_t5 import FlanT5 as FlanT5 - from .models.flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer - from .models.flan_t5 import FlanT5WithTokenizer as FlanT5WithTokenizer try: if not imports.is_flax_available(): @@ -166,18 +120,8 @@ if t.TYPE_CHECKING: from .models.auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING from .models.auto import \ MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES - from .models.auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING - from .models.auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES from .models.auto import AutoFlaxLLM as AutoFlaxLLM - from .models.auto import \ - AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5 - from .models.flan_t5 import \ - FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer try: if not imports.is_tf_available(): @@ -188,17 +132,8 @@ if t.TYPE_CHECKING: from .models.auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING from .models.auto import \ MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES - from .models.auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING - from .models.auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES from .models.auto import AutoTFLLM as AutoTFLLM - from .models.auto import \ - AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer from .models.flan_t5 import TFFlanT5 as TFFlanT5 - from .models.flan_t5 import \ - TFFlanT5WithTokenizer as TFFlanT5WithTokenizer else: import sys diff --git a/src/openllm/models/auto/__init__.py b/src/openllm/models/auto/__init__.py index 53bd82b2..1ea6f531 100644 --- a/src/openllm/models/auto/__init__.py +++ b/src/openllm/models/auto/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""This module is derived from HuggingFace's AutoConfig, Tokenizer, AutoModel, etc.""" +"""This module is derived from HuggingFace's AutoConfig, AutoModel, etc.""" from __future__ import annotations @@ -25,7 +25,6 @@ from ...utils import import_utils_shim as imports _import_structure = { "configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"], - "tokenization_auto": ["AutoTokenizer", "TOKENIZER_MAPPING", "TOKENIZER_MAPPING_NAMES"], } try: @@ -34,14 +33,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_auto"] = [ - "AutoLLM", - "AutoLLMWithTokenizer", - "MODEL_MAPPING_NAMES", - "MODEL_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_MAPPING", - "MODEL_WITH_TOKENIZER_MAPPING", - ] + _import_structure["modeling_auto"] = ["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"] try: if not imports.is_flax_available(): @@ -49,14 +41,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flax_auto"] = [ - "AutoFlaxLLM", - "AutoFlaxLLMWithTokenizer", - "MODEL_FLAX_MAPPING_NAMES", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_FLAX_MAPPING", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING", - ] + _import_structure["modeling_flax_auto"] = ["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"] try: if not imports.is_tf_available(): @@ -64,24 +49,13 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_tf_auto"] = [ - "AutoTFLLM", - "AutoTFLLMWithTokenizer", - "MODEL_TF_MAPPING_NAMES", - "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_TF_MAPPING", - "MODEL_TF_WITH_TOKENIZER_MAPPING", - ] + _import_structure["modeling_tf_auto"] = ["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"] if t.TYPE_CHECKING: from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING from .configuration_auto import \ CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES from .configuration_auto import AutoConfig as AutoConfig - from .tokenization_auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING - from .tokenization_auto import \ - TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES - from .tokenization_auto import AutoTokenizer as AutoTokenizer try: if not imports.is_torch_available(): @@ -91,13 +65,7 @@ if t.TYPE_CHECKING: else: from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES - from .modeling_auto import \ - MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING - from .modeling_auto import \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES from .modeling_auto import AutoLLM as AutoLLM - from .modeling_auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer try: if not imports.is_flax_available(): @@ -109,15 +77,7 @@ if t.TYPE_CHECKING: MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING from .modeling_flax_auto import \ MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES - from .modeling_flax_auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING - from .modeling_flax_auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES from .modeling_flax_auto import AutoFlaxLLM as AutoFlaxLLM - from .modeling_flax_auto import \ - AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer try: if not imports.is_tf_available(): @@ -128,14 +88,7 @@ if t.TYPE_CHECKING: from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING from .modeling_tf_auto import \ MODEL_TF_MAPPING_NAMES as 
MODEL_TF_MAPPING_NAMES - from .modeling_tf_auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING - from .modeling_tf_auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES from .modeling_tf_auto import AutoTFLLM as AutoTFLLM - from .modeling_tf_auto import \ - AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer else: import sys diff --git a/src/openllm/models/auto/factory.py b/src/openllm/models/auto/factory.py index 0a21c4de..347059b8 100644 --- a/src/openllm/models/auto/factory.py +++ b/src/openllm/models/auto/factory.py @@ -23,7 +23,8 @@ import openllm from .configuration_auto import AutoConfig -def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable[t.Any, t.Any]]: + +def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable]: supported_runnables = runnable_mapping[type(config)] if not isinstance(supported_runnables, (list, tuple)): return supported_runnables @@ -72,7 +73,7 @@ class _BaseAutoRunnerFactory: ) @classmethod - def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable[t.Any, t.Any]]): + def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable]): """ Register a new model for this class. diff --git a/src/openllm/models/auto/modeling_auto.py b/src/openllm/models/auto/modeling_auto.py index 28478dc9..cdea672f 100644 --- a/src/openllm/models/auto/modeling_auto.py +++ b/src/openllm/models/auto/modeling_auto.py @@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5")]) -MODEL_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5WithTokenizer")]) - MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) -MODEL_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_TOKENIZER_MAPPING_NAMES) - class AutoLLM(_BaseAutoRunnerFactory): _model_mapping = MODEL_MAPPING - - -class AutoLLMWithTokenizer(_BaseAutoRunnerFactory): - _model_mapping = MODEL_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/modeling_flax_auto.py b/src/openllm/models/auto/modeling_flax_auto.py index 3429b902..dfb8e087 100644 --- a/src/openllm/models/auto/modeling_flax_auto.py +++ b/src/openllm/models/auto/modeling_flax_auto.py @@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5")]) -MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")]) - MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES) -MODEL_FLAX_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES) - class AutoFlaxLLM(_BaseAutoRunnerFactory): _model_mapping = MODEL_FLAX_MAPPING - - -class AutoFlaxLLMWithTokenizer(_BaseAutoRunnerFactory): - _model_mapping = MODEL_FLAX_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/modeling_tf_auto.py b/src/openllm/models/auto/modeling_tf_auto.py index dbb3c762..668bfe3c 100644 --- a/src/openllm/models/auto/modeling_tf_auto.py +++ b/src/openllm/models/auto/modeling_tf_auto.py @@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5")]) -MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", 
"FlaxFlanT5WithTokenizer")]) - -MODEL_TF_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES) - -MODEL_TF_WITH_TOKENIZER_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES) +MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES) class AutoTFLLM(_BaseAutoRunnerFactory): _model_mapping = MODEL_TF_MAPPING - - -class AutoTFLLMWithTokenizer(_BaseAutoRunnerFactory): - _model_mapping = MODEL_TF_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/tokenization_auto.py b/src/openllm/models/auto/tokenization_auto.py deleted file mode 100644 index a7ebfeac..00000000 --- a/src/openllm/models/auto/tokenization_auto.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2023 BentoML Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import typing as t -from collections import OrderedDict - -import openllm - -from .configuration_auto import _LazyConfigMapping - -TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Tokenizer")]) - -TOKENIZER_MAPPING = _LazyConfigMapping(TOKENIZER_MAPPING_NAMES) - - -class AutoTokenizer: - def __init__(self): - raise EnvironmentError( - "This class should not be initialized directly. Instead use 'Tokenizer.create_runner' instead" - ) - - @classmethod - def create_runner(cls, model_name: str, pretrained_or_path: str | None = None, **kwargs: t.Any): - model_name = openllm.utils.kebab_to_snake_case(model_name) - if model_name in TOKENIZER_MAPPING: - tokenizer_class = TOKENIZER_MAPPING[model_name] - if pretrained_or_path is None: - pretrained_or_path = openllm.utils.get_pretrained_env(tokenizer_class.start_model_name) - return tokenizer_class(pretrained_or_path, **kwargs) - raise ValueError( - f"Unrecognized model {model_name} to build an Tokenizer.\n" - f"Model type should be one of {', '.join(TOKENIZER_MAPPING.keys())}." 
- ) diff --git a/src/openllm/models/chatglm/__init__.py b/src/openllm/models/chatglm/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/chatglm/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/dolly_v2/__init__.py b/src/openllm/models/dolly_v2/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/dolly_v2/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/flan_t5/__init__.py b/src/openllm/models/flan_t5/__init__.py index 567e8c8d..b8e52a44 100644 --- a/src/openllm/models/flan_t5/__init__.py +++ b/src/openllm/models/flan_t5/__init__.py @@ -21,7 +21,7 @@ from openllm.utils import import_utils_shim as imports _import_structure = { "configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"], - "service_flan_t5": ["svc", "model_runner", "tokenizer_runner", "generate"], + "service_flan_t5": ["svc", "model_runner", "generate"], } try: @@ -30,7 +30,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flan_t5"] = ["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"] + _import_structure["modeling_flan_t5"] = ["FlanT5"] try: if not imports.is_flax_available(): @@ -38,7 +38,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5", "FlaxFlanT5WithTokenizer"] + _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"] try: if not imports.is_tf_available(): @@ -46,7 +46,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flax_flan_t5"] = ["TFFlanT5", "TFFlanT5WithTokenizer"] + _import_structure["modeling_flax_flan_t5"] = ["TFFlanT5"] if t.TYPE_CHECKING: @@ -55,6 +55,7 @@ if t.TYPE_CHECKING: from .configuration_flan_t5 import \ START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING from .configuration_flan_t5 import FlanT5Config as FlanT5Config + from .service_flan_t5 import svc as svc try: if not imports.is_torch_available(): @@ -63,9 +64,6 @@ if t.TYPE_CHECKING: pass else: from .modeling_flan_t5 import FlanT5 as FlanT5 - from .modeling_flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer - from .modeling_flan_t5 import \ - FlanT5WithTokenizer as FlanT5WithTokenizer try: if not imports.is_flax_available(): @@ -74,8 +72,6 @@ if t.TYPE_CHECKING: pass else: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5 - from .modeling_flax_flan_t5 import \ - FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer try: if not imports.is_tf_available(): @@ -84,8 +80,6 @@ if t.TYPE_CHECKING: pass else: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5 - from .modeling_tf_flan_t5 import \ - TFFlanT5WithTokenizer as TFFlanT5WithTokenizer else: import sys diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py index fd1d0c54..0758d379 100644 --- a/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -17,8 +17,7 @@ import typing as t import openllm -from ...runner_utils import (LLMRunnable, assign_start_model_name, - generate_tokenizer_runner) +from ...runner_utils import LLMRunnable from .configuration_flan_t5 import FlanT5Config if t.TYPE_CHECKING: @@ -62,37 +61,11 @@ def import_model( return bentoml.transformers.save_model(str(tag), model, 
custom_objects={"tokenizer": tokenizer}) -def _FlanT5Tokenizer( - pretrained_or_path: str | None = None, embedded: bool = True, **kwargs: t.Any -) -> openllm.types.TokenizerRunner: - """Get the runner for the tokenizer. - - Args: - model_name: The name of the FLAN-T5 model to import. - embedded: Whether to use the embedded runner or not. - **kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructors. - - Returns: - The runner for the tokenizer. - """ - if pretrained_or_path is None: - pretrained_or_path = FlanT5.default_model - - return generate_tokenizer_runner( - import_model(pretrained_or_path, **kwargs).custom_objects["tokenizer"], embedded=embedded - ) - - -FlanT5Tokenizer = assign_start_model_name("flan-t5")(_FlanT5Tokenizer) - - -class FlanT5( - LLMRunnable[transformers.T5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" -): +class FlanT5(LLMRunnable, start_model_name="flan-t5"): default_model: str = "google/flan-t5-large" config_class = FlanT5Config - ATTACH_TOKENIZER = False + ATTACH_TOKENIZER = True _llm_config: FlanT5Config @@ -106,9 +79,10 @@ class FlanT5( "google/flan-t5-xxl", ] + @torch.inference_mode() def _generate( self, - input_ids: torch.Tensor, + prompt: str, max_length: int | None = None, do_sample: bool = True, temperature: float | None = None, @@ -117,7 +91,9 @@ class FlanT5( repetition_penalty: float | None = None, **kwargs: t.Any, ) -> torch.Tensor: - return self.model.generate( + input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids + input_ids = input_ids.to(self.device) + outputs = self.model.generate( input_ids, max_length=max_length if max_length is not None else self._llm_config.max_length, do_sample=do_sample, @@ -129,15 +105,4 @@ class FlanT5( else self._llm_config.repetition_penalty, **kwargs, ) - - -class FlanT5WithTokenizer(FlanT5, start_model_name="flan-t5"): - default_model: str = "google/flan-t5-large" - - ATTACH_TOKENIZER = True - - def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: - input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids - input_ids = input_ids.to(self.device) - outputs = super()._generate(input_ids, **kwargs) return self.tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index 4fb1abd8..b742241d 100644 --- a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -61,13 +61,11 @@ def import_model( return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer}) -class FlaxFlanT5( - LLMRunnable[transformers.FlaxT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" -): +class FlaxFlanT5(LLMRunnable, start_model_name="flan-t5"): default_model: str = "google/flan-t5-large" config_class = FlanT5Config - ATTACH_TOKENIZER = False + ATTACH_TOKENIZER = True _llm_config: FlanT5Config @@ -81,7 +79,7 @@ class FlaxFlanT5( def _generate( self, - input_ids: jnp.ndarray, + prompt: str, max_length: int | None = None, do_sample: bool = True, temperature: float | None = None, @@ -90,7 +88,8 @@ class FlaxFlanT5( repetition_penalty: float | None = None, **kwargs: t.Any, ) -> jnp.ndarray: - return self.model.generate( + input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"] + outputs = self.model.generate( input_ids, max_length=max_length if max_length is not None else 
self._llm_config.max_length, do_sample=do_sample, @@ -102,16 +101,6 @@ class FlaxFlanT5( else self._llm_config.repetition_penalty, **kwargs, ) - - -class FlaxFlanT5WithTokenizer(FlaxFlanT5, start_model_name="flan-t5"): - default_model: str = "google/flan-t5-large" - - ATTACH_TOKENIZER = True - - def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: - input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"] - outputs = super()._generate(input_ids, **kwargs) return self.tokenizer.batch_decode( outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True ) diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index da0a999a..6dd45e5e 100644 --- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -60,13 +60,11 @@ def import_model( return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer}) -class TFFlanT5( - LLMRunnable[transformers.TFT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" -): +class TFFlanT5(LLMRunnable, start_model_name="flan-t5"): default_model: str = "google/flan-t5-large" config_class = FlanT5Config - ATTACH_TOKENIZER = False + ATTACH_TOKENIZER = True _llm_config: FlanT5Config @@ -80,7 +78,7 @@ class TFFlanT5( def _generate( self, - input_ids: tf.Tensor, + prompt: str, max_length: int | None = None, do_sample: bool = True, temperature: float | None = None, @@ -89,7 +87,8 @@ class TFFlanT5( repetition_penalty: float | None = None, **kwargs: t.Any, ) -> tf.Tensor: - return self.model.generate( + input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids + outputs = self.model.generate( input_ids, max_length=max_length if max_length is not None else self._llm_config.max_length, do_sample=do_sample, @@ -101,14 +100,4 @@ class TFFlanT5( else self._llm_config.repetition_penalty, **kwargs, ) - - -class TFFlanT5WithTokenizer(TFFlanT5, start_model_name="flan-t5"): - default_model: str = "google/flan-t5-large" - - ATTACH_TOKENIZER = True - - def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: - input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids - outputs = super()._generate(input_ids, **kwargs) return self.tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/src/openllm/models/flan_t5/service_flan_t5.py b/src/openllm/models/flan_t5/service_flan_t5.py index ccc54f80..361e249e 100644 --- a/src/openllm/models/flan_t5/service_flan_t5.py +++ b/src/openllm/models/flan_t5/service_flan_t5.py @@ -31,9 +31,8 @@ else: raise ValueError(f"Invalid framework {framework}") model_runner = klass.create_runner("flan-t5") -tokenizer_runner = openllm.AutoTokenizer.create_runner("flan-t5") -svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner, tokenizer_runner]) +svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner]) @svc.api( @@ -43,17 +42,7 @@ svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), ru async def generate(qa: openllm.schema.GenerateInput) -> openllm.schema.GenerateOutput: """Returns the generated text from given prompts.""" llm_config = model_runner.llm_config.with_options(**qa.llm_config).dict() - - return_tensors = "np" if framework == "flax" else framework - input_tensor = await tokenizer_runner.async_run(qa.prompt, return_tensors=return_tensors) - if framework == "flax": - 
outputs = await model_runner.generate.async_run(input_tensor["input_ids"], **llm_config) - responses = await tokenizer_runner.batch_decode.async_run( - outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - else: - outputs = await model_runner.generate.async_run(input_tensor.input_ids, **llm_config) - responses = await tokenizer_runner.batch_decode.async_run(outputs, skip_special_tokens=True) + responses = await model_runner.generate.async_run(qa.prompt, **llm_config) return openllm.schema.GenerateOutput(responses=responses, configuration=llm_config) diff --git a/src/openllm/models/gpt_neox/__init__.py b/src/openllm/models/gpt_neox/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/gpt_neox/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/gptj/__init__.py b/src/openllm/models/gptj/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/gptj/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/llama/__init__.py b/src/openllm/models/llama/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/llama/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/roberta/__init__.py b/src/openllm/models/roberta/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/roberta/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/stablelm/__init__.py b/src/openllm/models/stablelm/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/stablelm/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/runner_utils.py b/src/openllm/runner_utils.py index 3b81c95b..578c29e6 100644 --- a/src/openllm/runner_utils.py +++ b/src/openllm/runner_utils.py @@ -44,9 +44,6 @@ else: logger = logging.getLogger(__name__) -M = t.TypeVar("M") -T = t.TypeVar("T") - def assign_start_model_name(start_model_name: str) -> t.Callable[..., t.Any]: def wrapper(fn: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]: @@ -156,7 +153,7 @@ class BaseLLMRunnable(bentoml.Runnable, ABC): # TODO: Add support for model validation -class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]): +class LLMRunnable(BaseLLMRunnable): # The section below defines a loose contract with langchain's LLM interface. @property def _llm_type(self) -> str: @@ -173,8 +170,8 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]): # XXX: INTERNAL _module: LLMModuleType - _model: M | None = None - _tokenizer: T | None = None + _model: t.Any | None = None + _tokenizer: t.Any | None = None def __setattr__(self, attr_name: str, value: t.Any) -> None: if attr_name in ("ATTACH_TOKENIZER",): @@ -249,18 +246,18 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]): return super().__getattribute__(item) @classmethod - def dummy_object(cls) -> LLMRunnable[M, T]: + def dummy_object(cls) -> LLMRunnable: return cls(_dummy=True, _internal=True) @property - def model(self) -> M: + def model(self) -> t.Any: # NOTE: should we have support for nested runner here? 
if self._model is None: self._model = self._bentomodel.load_model() return self._model @property - def tokenizer(self) -> T: + def tokenizer(self) -> t.Any: # This is the runner generated from the bento model. This can # then be used for implementation of _generate. if self._tokenizer is None: @@ -368,7 +365,7 @@ class LLMRunner(bentoml.Runner): def __init__( self, - runnable_class: type[LLMRunnable[t.Any, t.Any]], + runnable_class: type[LLMRunnable], llm_config: LLMConfig, **kwargs: t.Any, ): diff --git a/src/openllm/types.py b/src/openllm/types.py index dab968c6..bb1c1acb 100644 --- a/src/openllm/types.py +++ b/src/openllm/types.py @@ -48,14 +48,10 @@ class LLMModuleType(LazyLoader): ) -> bentoml.Model: ... - class LLMConfigImpl(LLMConfig): + class LLMConfigImpl(LLMConfig, model_name="dummy"): ... - class LLMRunnableImpl(LLMRunnable[t.Any, t.Any], start_model_name="dummy"): - ... - - @staticmethod - def RunnableNameTokenizer(model_name: str | None = None, embedded: bool = True, **kwargs: t.Any) -> TokenizerRunner: + class LLMRunnableImpl(LLMRunnable, start_model_name="dummy"): ...
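
Usage note (not part of the patch): after this change each framework-specific runnable owns its tokenizer (ATTACH_TOKENIZER = True), so the separate tokenizer runner and the *WithTokenizer subclasses are gone and the service hands the raw prompt straight to the model runner. Below is a minimal sketch of the consolidated call path, assuming a BentoML runner initialized locally; the prompt text and the max_length value are illustrative placeholders, not part of this diff.

import openllm

# A single runner now owns both the model and its tokenizer, so no
# tokenizer runner is created alongside it.
model_runner = openllm.AutoLLM.create_runner("flan-t5")

# init_local() is BentoML's way to exercise a runner outside a service;
# shown here only for local experimentation.
model_runner.init_local()

# _generate now accepts a raw prompt string and returns decoded text:
# tokenization, generation, and batch_decode all happen inside the runnable.
responses = model_runner.generate.run(
    "Translate to German: The house is wonderful.",
    max_length=64,
)
print(responses)

Folding the tokenizer into the runnable also means a request crosses the runner boundary once as a string instead of shuttling tensors between a tokenizer runner and a model runner, which is what lets service_flan_t5.py drop its framework-specific tokenize/decode branches.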