infra: add structure and clean up tokenizer separation

Since tokenizers are relatively lightweight, all default LLMs will bundle the
tokenizer with themselves.

We may move the tokenizer into its own runner in the future.
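
The service thus goes from two runners down to one: previously it carried a
separate tokenizer runner and shuttled tensors between the two. A rough sketch
of the new call path (AutoLLM.create_runner and the single-runner service come
from this diff; the prompt string and the init_local() call, a BentoML helper
for running a runner in-process, are illustrative only):

import openllm

# One runner now owns both the model and its tokenizer: the runnable sets
# ATTACH_TOKENIZER = True and _generate() takes the raw prompt string.
model_runner = openllm.AutoLLM.create_runner("flan-t5")
model_runner.init_local()  # in-process init for demo only; a bentoml.Service handles this

# Tokenization and decoding now happen inside the runnable, so callers get
# decoded strings back from a single runner call:
responses = model_runner.generate.run("Translate to German: Hello, world!")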

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron
2023-05-05 11:57:39 -07:00
parent 426a61713f
commit 2a53faee9c
21 changed files with 57 additions and 312 deletions

View File

@@ -34,7 +34,7 @@ _import_structure = {
"cli": [],
"configuration_utils": ["LLMConfig"],
"exceptions": [],
"runner_utils": ["LLMRunner", "LLMRunnable", "generate_tokenizer_runner"],
"runner_utils": ["LLMRunner", "LLMRunnable"],
"schema": ["PromptTemplate"],
"server_utils": ["start", "start_grpc"],
"types": [],
@@ -42,13 +42,7 @@ _import_structure = {
"models": [],
"client": [],
# NOTE: models
"models.auto": [
"AutoConfig",
"CONFIG_MAPPING",
"AutoTokenizer",
"TOKENIZER_MAPPING",
"TOKENIZER_MAPPING_NAMES",
],
"models.auto": ["AutoConfig", "CONFIG_MAPPING"],
"models.flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING"],
}
@@ -58,17 +52,8 @@ try:
except MissingDependencyError:
pass
else:
_import_structure["models.flan_t5"].extend(["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"])
_import_structure["models.auto"].extend(
[
"AutoLLM",
"AutoLLMWithTokenizer",
"MODEL_MAPPING_NAMES",
"MODEL_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_MAPPING",
"MODEL_WITH_TOKENIZER_MAPPING",
]
)
_import_structure["models.flan_t5"].extend(["FlanT5"])
_import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"])
try:
if not imports.is_flax_available():
@@ -76,17 +61,8 @@ try:
except MissingDependencyError:
pass
else:
_import_structure["models.flan_t5"].extend(["FlaxFlanT5", "FlaxFlanT5WithTokenizer"])
_import_structure["models.auto"].extend(
[
"AutoFlaxLLM",
"AutoFlaxLLMWithTokenizer",
"MODEL_FLAX_MAPPING_NAMES",
"MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_FLAX_MAPPING",
"MODEL_FLAX_WITH_TOKENIZER_MAPPING",
]
)
_import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
_import_structure["models.auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"])
try:
if not imports.is_tf_available():
@@ -94,17 +70,8 @@ try:
except MissingDependencyError:
pass
else:
_import_structure["models.flan_t5"].extend(["TFFlanT5", "TFFlanT5WithTokenizer"])
_import_structure["models.auto"].extend(
[
"AutoTFLLM",
"AutoTFLLMWithTokenizer",
"MODEL_TF_MAPPING_NAMES",
"MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_TF_MAPPING",
"MODEL_TF_WITH_TOKENIZER_MAPPING",
]
)
_import_structure["models.flan_t5"].extend(["TFFlanT5"])
_import_structure["models.auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"])
# declaration for OpenLLM-related modules
@@ -123,17 +90,12 @@ if t.TYPE_CHECKING:
# Specific types import
from .configuration_utils import LLMConfig as LLMConfig
from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING
from .models.auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING
from .models.auto import TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES
from .models.auto import AutoConfig as AutoConfig
from .models.auto import AutoTokenizer as AutoTokenizer
from .models.flan_t5 import \
START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
from .models.flan_t5 import FlanT5Config as FlanT5Config
from .runner_utils import LLMRunnable as LLMRunnable
from .runner_utils import LLMRunner as LLMRunner
from .runner_utils import \
generate_tokenizer_runner as generate_tokenizer_runner
from .schema import PromptTemplate as PromptTemplate
from .server_utils import start as start
from .server_utils import start_grpc as start_grpc
@@ -146,16 +108,8 @@ if t.TYPE_CHECKING:
else:
from .models.auto import MODEL_MAPPING as MODEL_MAPPING
from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
from .models.auto import \
MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING
from .models.auto import \
MODEL_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_WITH_TOKENIZER_MAPPING_NAMES
from .models.auto import AutoLLM as AutoLLM
from .models.auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer
from .models.flan_t5 import FlanT5 as FlanT5
from .models.flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer
from .models.flan_t5 import FlanT5WithTokenizer as FlanT5WithTokenizer
try:
if not imports.is_flax_available():
@@ -166,18 +120,8 @@ if t.TYPE_CHECKING:
from .models.auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING
from .models.auto import \
MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
from .models.auto import \
MODEL_FLAX_WITH_TOKENIZER_MAPPING as \
MODEL_FLAX_WITH_TOKENIZER_MAPPING
from .models.auto import \
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES
from .models.auto import AutoFlaxLLM as AutoFlaxLLM
from .models.auto import \
AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer
from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5
from .models.flan_t5 import \
FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer
try:
if not imports.is_tf_available():
@@ -188,17 +132,8 @@ if t.TYPE_CHECKING:
from .models.auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING
from .models.auto import \
MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
from .models.auto import \
MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING
from .models.auto import \
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES
from .models.auto import AutoTFLLM as AutoTFLLM
from .models.auto import \
AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer
from .models.flan_t5 import TFFlanT5 as TFFlanT5
from .models.flan_t5 import \
TFFlanT5WithTokenizer as TFFlanT5WithTokenizer
else:
import sys

View File

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module is derived from HuggingFace's AutoConfig, Tokenizer, AutoModel, etc."""
"""This module is derived from HuggingFace's AutoConfig, AutoModel, etc."""
from __future__ import annotations
@@ -25,7 +25,6 @@ from ...utils import import_utils_shim as imports
_import_structure = {
"configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"],
"tokenization_auto": ["AutoTokenizer", "TOKENIZER_MAPPING", "TOKENIZER_MAPPING_NAMES"],
}
try:
@@ -34,14 +33,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_auto"] = [
"AutoLLM",
"AutoLLMWithTokenizer",
"MODEL_MAPPING_NAMES",
"MODEL_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_MAPPING",
"MODEL_WITH_TOKENIZER_MAPPING",
]
_import_structure["modeling_auto"] = ["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"]
try:
if not imports.is_flax_available():
@@ -49,14 +41,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_flax_auto"] = [
"AutoFlaxLLM",
"AutoFlaxLLMWithTokenizer",
"MODEL_FLAX_MAPPING_NAMES",
"MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_FLAX_MAPPING",
"MODEL_FLAX_WITH_TOKENIZER_MAPPING",
]
_import_structure["modeling_flax_auto"] = ["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"]
try:
if not imports.is_tf_available():
@@ -64,24 +49,13 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_tf_auto"] = [
"AutoTFLLM",
"AutoTFLLMWithTokenizer",
"MODEL_TF_MAPPING_NAMES",
"MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES",
"MODEL_TF_MAPPING",
"MODEL_TF_WITH_TOKENIZER_MAPPING",
]
_import_structure["modeling_tf_auto"] = ["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"]
if t.TYPE_CHECKING:
from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING
from .configuration_auto import \
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES
from .configuration_auto import AutoConfig as AutoConfig
from .tokenization_auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING
from .tokenization_auto import \
TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES
from .tokenization_auto import AutoTokenizer as AutoTokenizer
try:
if not imports.is_torch_available():
@@ -91,13 +65,7 @@ if t.TYPE_CHECKING:
else:
from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING
from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
from .modeling_auto import \
MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING
from .modeling_auto import \
MODEL_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_WITH_TOKENIZER_MAPPING_NAMES
from .modeling_auto import AutoLLM as AutoLLM
from .modeling_auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer
try:
if not imports.is_flax_available():
@@ -109,15 +77,7 @@ if t.TYPE_CHECKING:
MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING
from .modeling_flax_auto import \
MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
from .modeling_flax_auto import \
MODEL_FLAX_WITH_TOKENIZER_MAPPING as \
MODEL_FLAX_WITH_TOKENIZER_MAPPING
from .modeling_flax_auto import \
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES
from .modeling_flax_auto import AutoFlaxLLM as AutoFlaxLLM
from .modeling_flax_auto import \
AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer
try:
if not imports.is_tf_available():
@@ -128,14 +88,7 @@ if t.TYPE_CHECKING:
from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING
from .modeling_tf_auto import \
MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES
from .modeling_tf_auto import \
MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING
from .modeling_tf_auto import \
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES
from .modeling_tf_auto import AutoTFLLM as AutoTFLLM
from .modeling_tf_auto import \
AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer
else:
import sys

View File

@@ -23,7 +23,8 @@ import openllm
from .configuration_auto import AutoConfig
def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable[t.Any, t.Any]]:
def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable]:
supported_runnables = runnable_mapping[type(config)]
if not isinstance(supported_runnables, (list, tuple)):
return supported_runnables
@@ -72,7 +73,7 @@ class _BaseAutoRunnerFactory:
)
@classmethod
def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable[t.Any, t.Any]]):
def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable]):
"""
Register a new model for this class.

View File

@@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5")])
MODEL_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5WithTokenizer")])
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
MODEL_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_TOKENIZER_MAPPING_NAMES)
class AutoLLM(_BaseAutoRunnerFactory):
_model_mapping = MODEL_MAPPING
class AutoLLMWithTokenizer(_BaseAutoRunnerFactory):
_model_mapping = MODEL_WITH_TOKENIZER_MAPPING

View File

@@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5")])
MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")])
MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES)
MODEL_FLAX_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES)
class AutoFlaxLLM(_BaseAutoRunnerFactory):
_model_mapping = MODEL_FLAX_MAPPING
class AutoFlaxLLMWithTokenizer(_BaseAutoRunnerFactory):
_model_mapping = MODEL_FLAX_WITH_TOKENIZER_MAPPING

View File

@@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping
MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5")])
MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")])
MODEL_TF_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
MODEL_TF_WITH_TOKENIZER_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES)
MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
class AutoTFLLM(_BaseAutoRunnerFactory):
_model_mapping = MODEL_TF_MAPPING
class AutoTFLLMWithTokenizer(_BaseAutoRunnerFactory):
_model_mapping = MODEL_TF_WITH_TOKENIZER_MAPPING

View File

@@ -1,46 +0,0 @@
# Copyright 2023 BentoML Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import typing as t
from collections import OrderedDict
import openllm
from .configuration_auto import _LazyConfigMapping
TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Tokenizer")])
TOKENIZER_MAPPING = _LazyConfigMapping(TOKENIZER_MAPPING_NAMES)
class AutoTokenizer:
def __init__(self):
raise EnvironmentError(
"This class should not be initialized directly. Instead use 'Tokenizer.create_runner' instead"
)
@classmethod
def create_runner(cls, model_name: str, pretrained_or_path: str | None = None, **kwargs: t.Any):
model_name = openllm.utils.kebab_to_snake_case(model_name)
if model_name in TOKENIZER_MAPPING:
tokenizer_class = TOKENIZER_MAPPING[model_name]
if pretrained_or_path is None:
pretrained_or_path = openllm.utils.get_pretrained_env(tokenizer_class.start_model_name)
return tokenizer_class(pretrained_or_path, **kwargs)
raise ValueError(
f"Unrecognized model {model_name} to build an Tokenizer.\n"
f"Model type should be one of {', '.join(TOKENIZER_MAPPING.keys())}."
)

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -21,7 +21,7 @@ from openllm.utils import import_utils_shim as imports
_import_structure = {
"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
"service_flan_t5": ["svc", "model_runner", "tokenizer_runner", "generate"],
"service_flan_t5": ["svc", "model_runner", "generate"],
}
try:
@@ -30,7 +30,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_flan_t5"] = ["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"]
_import_structure["modeling_flan_t5"] = ["FlanT5"]
try:
if not imports.is_flax_available():
@@ -38,7 +38,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5", "FlaxFlanT5WithTokenizer"]
_import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
try:
if not imports.is_tf_available():
@@ -46,7 +46,7 @@ try:
except openllm.exceptions.MissingDependencyError:
pass
else:
_import_structure["modeling_flax_flan_t5"] = ["TFFlanT5", "TFFlanT5WithTokenizer"]
_import_structure["modeling_flax_flan_t5"] = ["TFFlanT5"]
if t.TYPE_CHECKING:
@@ -55,6 +55,7 @@ if t.TYPE_CHECKING:
from .configuration_flan_t5 import \
START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
from .configuration_flan_t5 import FlanT5Config as FlanT5Config
from .service_flan_t5 import svc as svc
try:
if not imports.is_torch_available():
@@ -63,9 +64,6 @@ if t.TYPE_CHECKING:
pass
else:
from .modeling_flan_t5 import FlanT5 as FlanT5
from .modeling_flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer
from .modeling_flan_t5 import \
FlanT5WithTokenizer as FlanT5WithTokenizer
try:
if not imports.is_flax_available():
@@ -74,8 +72,6 @@ if t.TYPE_CHECKING:
pass
else:
from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
from .modeling_flax_flan_t5 import \
FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer
try:
if not imports.is_tf_available():
@@ -84,8 +80,6 @@ if t.TYPE_CHECKING:
pass
else:
from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
from .modeling_tf_flan_t5 import \
TFFlanT5WithTokenizer as TFFlanT5WithTokenizer
else:
import sys

View File

@@ -17,8 +17,7 @@ import typing as t
import openllm
from ...runner_utils import (LLMRunnable, assign_start_model_name,
generate_tokenizer_runner)
from ...runner_utils import LLMRunnable
from .configuration_flan_t5 import FlanT5Config
if t.TYPE_CHECKING:
@@ -62,37 +61,11 @@ def import_model(
return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
def _FlanT5Tokenizer(
pretrained_or_path: str | None = None, embedded: bool = True, **kwargs: t.Any
) -> openllm.types.TokenizerRunner:
"""Get the runner for the tokenizer.
Args:
model_name: The name of the FLAN-T5 model to import.
embedded: Whether to use the embedded runner or not.
**kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructors.
Returns:
The runner for the tokenizer.
"""
if pretrained_or_path is None:
pretrained_or_path = FlanT5.default_model
return generate_tokenizer_runner(
import_model(pretrained_or_path, **kwargs).custom_objects["tokenizer"], embedded=embedded
)
FlanT5Tokenizer = assign_start_model_name("flan-t5")(_FlanT5Tokenizer)
class FlanT5(
LLMRunnable[transformers.T5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
):
class FlanT5(LLMRunnable, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
config_class = FlanT5Config
ATTACH_TOKENIZER = False
ATTACH_TOKENIZER = True
_llm_config: FlanT5Config
@@ -106,9 +79,10 @@ class FlanT5(
"google/flan-t5-xxl",
]
@torch.inference_mode()
def _generate(
self,
input_ids: torch.Tensor,
prompt: str,
max_length: int | None = None,
do_sample: bool = True,
temperature: float | None = None,
@@ -117,7 +91,9 @@ class FlanT5(
repetition_penalty: float | None = None,
**kwargs: t.Any,
) -> torch.Tensor:
return self.model.generate(
input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(self.device)
outputs = self.model.generate(
input_ids,
max_length=max_length if max_length is not None else self._llm_config.max_length,
do_sample=do_sample,
@@ -129,15 +105,4 @@ class FlanT5(
else self._llm_config.repetition_penalty,
**kwargs,
)
class FlanT5WithTokenizer(FlanT5, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
ATTACH_TOKENIZER = True
def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(self.device)
outputs = super()._generate(input_ids, **kwargs)
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

View File

@@ -61,13 +61,11 @@ def import_model(
return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
class FlaxFlanT5(
LLMRunnable[transformers.FlaxT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
):
class FlaxFlanT5(LLMRunnable, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
config_class = FlanT5Config
ATTACH_TOKENIZER = False
ATTACH_TOKENIZER = True
_llm_config: FlanT5Config
@@ -81,7 +79,7 @@ class FlaxFlanT5(
def _generate(
self,
input_ids: jnp.ndarray,
prompt: str,
max_length: int | None = None,
do_sample: bool = True,
temperature: float | None = None,
@@ -90,7 +88,8 @@ class FlaxFlanT5(
repetition_penalty: float | None = None,
**kwargs: t.Any,
) -> jnp.ndarray:
return self.model.generate(
input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
outputs = self.model.generate(
input_ids,
max_length=max_length if max_length is not None else self._llm_config.max_length,
do_sample=do_sample,
@@ -102,16 +101,6 @@ class FlaxFlanT5(
else self._llm_config.repetition_penalty,
**kwargs,
)
class FlaxFlanT5WithTokenizer(FlaxFlanT5, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
ATTACH_TOKENIZER = True
def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
outputs = super()._generate(input_ids, **kwargs)
return self.tokenizer.batch_decode(
outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

View File

@@ -60,13 +60,11 @@ def import_model(
return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer})
class TFFlanT5(
LLMRunnable[transformers.TFT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5"
):
class TFFlanT5(LLMRunnable, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
config_class = FlanT5Config
ATTACH_TOKENIZER = False
ATTACH_TOKENIZER = True
_llm_config: FlanT5Config
@@ -80,7 +78,7 @@ class TFFlanT5(
def _generate(
self,
input_ids: tf.Tensor,
prompt: str,
max_length: int | None = None,
do_sample: bool = True,
temperature: float | None = None,
@@ -89,7 +87,8 @@ class TFFlanT5(
repetition_penalty: float | None = None,
**kwargs: t.Any,
) -> tf.Tensor:
return self.model.generate(
input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids
outputs = self.model.generate(
input_ids,
max_length=max_length if max_length is not None else self._llm_config.max_length,
do_sample=do_sample,
@@ -101,14 +100,4 @@ class TFFlanT5(
else self._llm_config.repetition_penalty,
**kwargs,
)
class TFFlanT5WithTokenizer(TFFlanT5, start_model_name="flan-t5"):
default_model: str = "google/flan-t5-large"
ATTACH_TOKENIZER = True
def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]:
input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids
outputs = super()._generate(input_ids, **kwargs)
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

View File

@@ -31,9 +31,8 @@ else:
raise ValueError(f"Invalid framework {framework}")
model_runner = klass.create_runner("flan-t5")
tokenizer_runner = openllm.AutoTokenizer.create_runner("flan-t5")
svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner, tokenizer_runner])
svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner])
@svc.api(
@@ -43,17 +42,7 @@ svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), ru
async def generate(qa: openllm.schema.GenerateInput) -> openllm.schema.GenerateOutput:
"""Returns the generated text from given prompts."""
llm_config = model_runner.llm_config.with_options(**qa.llm_config).dict()
return_tensors = "np" if framework == "flax" else framework
input_tensor = await tokenizer_runner.async_run(qa.prompt, return_tensors=return_tensors)
if framework == "flax":
outputs = await model_runner.generate.async_run(input_tensor["input_ids"], **llm_config)
responses = await tokenizer_runner.batch_decode.async_run(
outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
else:
outputs = await model_runner.generate.async_run(input_tensor.input_ids, **llm_config)
responses = await tokenizer_runner.batch_decode.async_run(outputs, skip_special_tokens=True)
responses = await model_runner.generate.async_run(qa.prompt, **llm_config)
return openllm.schema.GenerateOutput(responses=responses, configuration=llm_config)

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -0,0 +1 @@
raise NotImplementedError("This module is not implemented yet.")

View File

@@ -44,9 +44,6 @@ else:
logger = logging.getLogger(__name__)
M = t.TypeVar("M")
T = t.TypeVar("T")
def assign_start_model_name(start_model_name: str) -> t.Callable[..., t.Any]:
def wrapper(fn: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]:
@@ -156,7 +153,7 @@ class BaseLLMRunnable(bentoml.Runnable, ABC):
# TODO: Add support for model validation
class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
class LLMRunnable(BaseLLMRunnable):
# The section below defines a loose contract with langchain's LLM interface.
@property
def _llm_type(self) -> str:
@@ -173,8 +170,8 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
# XXX: INTERNAL
_module: LLMModuleType
_model: M | None = None
_tokenizer: T | None = None
_model: t.Any | None = None
_tokenizer: t.Any | None = None
def __setattr__(self, attr_name: str, value: t.Any) -> None:
if attr_name in ("ATTACH_TOKENIZER",):
@@ -249,18 +246,18 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]):
return super().__getattribute__(item)
@classmethod
def dummy_object(cls) -> LLMRunnable[M, T]:
def dummy_object(cls) -> LLMRunnable:
return cls(_dummy=True, _internal=True)
@property
def model(self) -> M:
def model(self) -> t.Any:
# NOTE: should we have support for nested runner here?
if self._model is None:
self._model = self._bentomodel.load_model()
return self._model
@property
def tokenizer(self) -> T:
def tokenizer(self) -> t.Any:
# This is the runner generated from the bento model. This can
# then be used for implementation of _generate.
if self._tokenizer is None:
@@ -368,7 +365,7 @@ class LLMRunner(bentoml.Runner):
def __init__(
self,
runnable_class: type[LLMRunnable[t.Any, t.Any]],
runnable_class: type[LLMRunnable],
llm_config: LLMConfig,
**kwargs: t.Any,
):

View File

@@ -48,14 +48,10 @@ class LLMModuleType(LazyLoader):
) -> bentoml.Model:
...
class LLMConfigImpl(LLMConfig):
class LLMConfigImpl(LLMConfig, model_name="dummy"):
...
class LLMRunnableImpl(LLMRunnable[t.Any, t.Any], start_model_name="dummy"):
...
@staticmethod
def RunnableNameTokenizer(model_name: str | None = None, embedded: bool = True, **kwargs: t.Any) -> TokenizerRunner:
class LLMRunnableImpl(LLMRunnable, start_model_name="dummy"):
...