From 2a53faee9c26cbac78e2a7849c61d80bea882f83 Mon Sep 17 00:00:00 2001
From: Aaron <29749331+aarnphm@users.noreply.github.com>
Date: Fri, 5 May 2023 11:57:39 -0700
Subject: [PATCH] infra: add structure and clean up tokenizer separation

Since tokenizers are relatively lightweight, every default LLM now bundles its
tokenizer with itself. We may move the tokenizer into its own runner in the
future.

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 src/openllm/__init__.py | 81 ++-----------------
 src/openllm/models/auto/__init__.py | 55 +------------
 src/openllm/models/auto/factory.py | 5 +-
 src/openllm/models/auto/modeling_auto.py | 8 --
 src/openllm/models/auto/modeling_flax_auto.py | 8 --
 src/openllm/models/auto/modeling_tf_auto.py | 10 +--
 src/openllm/models/auto/tokenization_auto.py | 46 -----------
 src/openllm/models/chatglm/__init__.py | 1 +
 src/openllm/models/dolly_v2/__init__.py | 1 +
 src/openllm/models/flan_t5/__init__.py | 16 ++--
 .../models/flan_t5/modeling_flan_t5.py | 51 ++----------
 .../models/flan_t5/modeling_flax_flan_t5.py | 21 ++---
 .../models/flan_t5/modeling_tf_flan_t5.py | 21 ++---
 src/openllm/models/flan_t5/service_flan_t5.py | 15 +---
 src/openllm/models/gpt_neox/__init__.py | 1 +
 src/openllm/models/gptj/__init__.py | 1 +
 src/openllm/models/llama/__init__.py | 1 +
 src/openllm/models/roberta/__init__.py | 1 +
 src/openllm/models/stablelm/__init__.py | 1 +
 src/openllm/runner_utils.py | 17 ++--
 src/openllm/types.py | 8 +-
 21 files changed, 57 insertions(+), 312 deletions(-)
 delete mode 100644 src/openllm/models/auto/tokenization_auto.py
 create mode 100644 src/openllm/models/chatglm/__init__.py
 create mode 100644 src/openllm/models/dolly_v2/__init__.py
 create mode 100644 src/openllm/models/gpt_neox/__init__.py
 create mode 100644 src/openllm/models/gptj/__init__.py
 create mode 100644 src/openllm/models/llama/__init__.py
 create mode 100644 src/openllm/models/roberta/__init__.py
 create mode 100644 src/openllm/models/stablelm/__init__.py
diff --git a/src/openllm/__init__.py b/src/openllm/__init__.py index cc7b1638..33ae1c07 100644 --- a/src/openllm/__init__.py +++ b/src/openllm/__init__.py @@ -34,7 +34,7 @@ _import_structure = { "cli": [], "configuration_utils": ["LLMConfig"], "exceptions": [], - "runner_utils": ["LLMRunner", "LLMRunnable", "generate_tokenizer_runner"], + "runner_utils": ["LLMRunner", "LLMRunnable"], "schema": ["PromptTemplate"], "server_utils": ["start", "start_grpc"], "types": [], @@ -42,13 +42,7 @@ _import_structure = { "models": [], "client": [], # NOTE: models - "models.auto": [ - "AutoConfig", - "CONFIG_MAPPING", - "AutoTokenizer", - "TOKENIZER_MAPPING", - "TOKENIZER_MAPPING_NAMES", - ], + "models.auto": ["AutoConfig", "CONFIG_MAPPING"], "models.flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING"], } @@ -58,17 +52,8 @@ try: except MissingDependencyError: pass else: - _import_structure["models.flan_t5"].extend(["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"]) - _import_structure["models.auto"].extend( - [ - "AutoLLM", - "AutoLLMWithTokenizer", - "MODEL_MAPPING_NAMES", - "MODEL_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_MAPPING", - "MODEL_WITH_TOKENIZER_MAPPING", - ] - ) + _import_structure["models.flan_t5"].extend(["FlanT5"]) + _import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"]) try: if not imports.is_flax_available(): @@ -76,17 +61,8 @@ try: except MissingDependencyError: pass else: - _import_structure["models.flan_t5"].extend(["FlaxFlanT5", "FlaxFlanT5WithTokenizer"]) -
_import_structure["models.auto"].extend( - [ - "AutoFlaxLLM", - "AutoFlaxLLMWithTokenizer", - "MODEL_FLAX_MAPPING_NAMES", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_FLAX_MAPPING", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING", - ] - ) + _import_structure["models.flan_t5"].extend(["FlaxFlanT5"]) + _import_structure["models.auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"]) try: if not imports.is_tf_available(): @@ -94,17 +70,8 @@ try: except MissingDependencyError: pass else: - _import_structure["models.flan_t5"].extend(["TFFlanT5", "TFFlanT5WithTokenizer"]) - _import_structure["models.auto"].extend( - [ - "AutoTFLLM", - "AutoTFLLMWithTokenizer", - "MODEL_TF_MAPPING_NAMES", - "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_TF_MAPPING", - "MODEL_TF_WITH_TOKENIZER_MAPPING", - ] - ) + _import_structure["models.flan_t5"].extend(["TFFlanT5"]) + _import_structure["models.auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"]) # declaration for OpenLLM-related modules @@ -123,17 +90,12 @@ if t.TYPE_CHECKING: # Specific types import from .configuration_utils import LLMConfig as LLMConfig from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING - from .models.auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING - from .models.auto import TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES from .models.auto import AutoConfig as AutoConfig - from .models.auto import AutoTokenizer as AutoTokenizer from .models.flan_t5 import \ START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING from .models.flan_t5 import FlanT5Config as FlanT5Config from .runner_utils import LLMRunnable as LLMRunnable from .runner_utils import LLMRunner as LLMRunner - from .runner_utils import \ - generate_tokenizer_runner as generate_tokenizer_runner from .schema import PromptTemplate as PromptTemplate from .server_utils import start as start from .server_utils import start_grpc as start_grpc @@ -146,16 +108,8 @@ if t.TYPE_CHECKING: else: from .models.auto import MODEL_MAPPING as MODEL_MAPPING from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES - from .models.auto import \ - MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING - from .models.auto import \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES from .models.auto import AutoLLM as AutoLLM - from .models.auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer from .models.flan_t5 import FlanT5 as FlanT5 - from .models.flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer - from .models.flan_t5 import FlanT5WithTokenizer as FlanT5WithTokenizer try: if not imports.is_flax_available(): @@ -166,18 +120,8 @@ if t.TYPE_CHECKING: from .models.auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING from .models.auto import \ MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES - from .models.auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING - from .models.auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES from .models.auto import AutoFlaxLLM as AutoFlaxLLM - from .models.auto import \ - AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5 - from .models.flan_t5 import \ - FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer try: if not imports.is_tf_available(): @@ -188,17 +132,8 @@ if t.TYPE_CHECKING: from .models.auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING from .models.auto import \ MODEL_TF_MAPPING_NAMES as 
MODEL_TF_MAPPING_NAMES - from .models.auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING - from .models.auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES from .models.auto import AutoTFLLM as AutoTFLLM - from .models.auto import \ - AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer from .models.flan_t5 import TFFlanT5 as TFFlanT5 - from .models.flan_t5 import \ - TFFlanT5WithTokenizer as TFFlanT5WithTokenizer else: import sys diff --git a/src/openllm/models/auto/__init__.py b/src/openllm/models/auto/__init__.py index 53bd82b2..1ea6f531 100644 --- a/src/openllm/models/auto/__init__.py +++ b/src/openllm/models/auto/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""This module is derived from HuggingFace's AutoConfig, Tokenizer, AutoModel, etc.""" +"""This module is derived from HuggingFace's AutoConfig, AutoModel, etc.""" from __future__ import annotations @@ -25,7 +25,6 @@ from ...utils import import_utils_shim as imports _import_structure = { "configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"], - "tokenization_auto": ["AutoTokenizer", "TOKENIZER_MAPPING", "TOKENIZER_MAPPING_NAMES"], } try: @@ -34,14 +33,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_auto"] = [ - "AutoLLM", - "AutoLLMWithTokenizer", - "MODEL_MAPPING_NAMES", - "MODEL_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_MAPPING", - "MODEL_WITH_TOKENIZER_MAPPING", - ] + _import_structure["modeling_auto"] = ["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"] try: if not imports.is_flax_available(): @@ -49,14 +41,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flax_auto"] = [ - "AutoFlaxLLM", - "AutoFlaxLLMWithTokenizer", - "MODEL_FLAX_MAPPING_NAMES", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_FLAX_MAPPING", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING", - ] + _import_structure["modeling_flax_auto"] = ["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"] try: if not imports.is_tf_available(): @@ -64,24 +49,13 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_tf_auto"] = [ - "AutoTFLLM", - "AutoTFLLMWithTokenizer", - "MODEL_TF_MAPPING_NAMES", - "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_TF_MAPPING", - "MODEL_TF_WITH_TOKENIZER_MAPPING", - ] + _import_structure["modeling_tf_auto"] = ["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"] if t.TYPE_CHECKING: from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING from .configuration_auto import \ CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES from .configuration_auto import AutoConfig as AutoConfig - from .tokenization_auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING - from .tokenization_auto import \ - TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES - from .tokenization_auto import AutoTokenizer as AutoTokenizer try: if not imports.is_torch_available(): @@ -91,13 +65,7 @@ if t.TYPE_CHECKING: else: from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES - from .modeling_auto import \ - MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING - from .modeling_auto import \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES from .modeling_auto import AutoLLM as AutoLLM - from .modeling_auto import 
AutoLLMWithTokenizer as AutoLLMWithTokenizer try: if not imports.is_flax_available(): @@ -109,15 +77,7 @@ if t.TYPE_CHECKING: MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING from .modeling_flax_auto import \ MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES - from .modeling_flax_auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING - from .modeling_flax_auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES from .modeling_flax_auto import AutoFlaxLLM as AutoFlaxLLM - from .modeling_flax_auto import \ - AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer try: if not imports.is_tf_available(): @@ -128,14 +88,7 @@ if t.TYPE_CHECKING: from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING from .modeling_tf_auto import \ MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES - from .modeling_tf_auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING - from .modeling_tf_auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES from .modeling_tf_auto import AutoTFLLM as AutoTFLLM - from .modeling_tf_auto import \ - AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer else: import sys diff --git a/src/openllm/models/auto/factory.py b/src/openllm/models/auto/factory.py index 0a21c4de..347059b8 100644 --- a/src/openllm/models/auto/factory.py +++ b/src/openllm/models/auto/factory.py @@ -23,7 +23,8 @@ import openllm from .configuration_auto import AutoConfig -def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable[t.Any, t.Any]]: + +def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable]: supported_runnables = runnable_mapping[type(config)] if not isinstance(supported_runnables, (list, tuple)): return supported_runnables @@ -72,7 +73,7 @@ class _BaseAutoRunnerFactory: ) @classmethod - def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable[t.Any, t.Any]]): + def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable]): """ Register a new model for this class. 
diff --git a/src/openllm/models/auto/modeling_auto.py b/src/openllm/models/auto/modeling_auto.py index 28478dc9..cdea672f 100644 --- a/src/openllm/models/auto/modeling_auto.py +++ b/src/openllm/models/auto/modeling_auto.py @@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5")]) -MODEL_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5WithTokenizer")]) - MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) -MODEL_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_TOKENIZER_MAPPING_NAMES) - class AutoLLM(_BaseAutoRunnerFactory): _model_mapping = MODEL_MAPPING - - -class AutoLLMWithTokenizer(_BaseAutoRunnerFactory): - _model_mapping = MODEL_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/modeling_flax_auto.py b/src/openllm/models/auto/modeling_flax_auto.py index 3429b902..dfb8e087 100644 --- a/src/openllm/models/auto/modeling_flax_auto.py +++ b/src/openllm/models/auto/modeling_flax_auto.py @@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5")]) -MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")]) - MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES) -MODEL_FLAX_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES) - class AutoFlaxLLM(_BaseAutoRunnerFactory): _model_mapping = MODEL_FLAX_MAPPING - - -class AutoFlaxLLMWithTokenizer(_BaseAutoRunnerFactory): - _model_mapping = MODEL_FLAX_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/modeling_tf_auto.py b/src/openllm/models/auto/modeling_tf_auto.py index dbb3c762..668bfe3c 100644 --- a/src/openllm/models/auto/modeling_tf_auto.py +++ b/src/openllm/models/auto/modeling_tf_auto.py @@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5")]) -MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")]) - -MODEL_TF_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES) - -MODEL_TF_WITH_TOKENIZER_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES) +MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES) class AutoTFLLM(_BaseAutoRunnerFactory): _model_mapping = MODEL_TF_MAPPING - - -class AutoTFLLMWithTokenizer(_BaseAutoRunnerFactory): - _model_mapping = MODEL_TF_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/tokenization_auto.py b/src/openllm/models/auto/tokenization_auto.py deleted file mode 100644 index a7ebfeac..00000000 --- a/src/openllm/models/auto/tokenization_auto.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2023 BentoML Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import typing as t -from collections import OrderedDict - -import openllm - -from .configuration_auto import _LazyConfigMapping - -TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Tokenizer")]) - -TOKENIZER_MAPPING = _LazyConfigMapping(TOKENIZER_MAPPING_NAMES) - - -class AutoTokenizer: - def __init__(self): - raise EnvironmentError( - "This class should not be initialized directly. Instead use 'Tokenizer.create_runner' instead" - ) - - @classmethod - def create_runner(cls, model_name: str, pretrained_or_path: str | None = None, **kwargs: t.Any): - model_name = openllm.utils.kebab_to_snake_case(model_name) - if model_name in TOKENIZER_MAPPING: - tokenizer_class = TOKENIZER_MAPPING[model_name] - if pretrained_or_path is None: - pretrained_or_path = openllm.utils.get_pretrained_env(tokenizer_class.start_model_name) - return tokenizer_class(pretrained_or_path, **kwargs) - raise ValueError( - f"Unrecognized model {model_name} to build an Tokenizer.\n" - f"Model type should be one of {', '.join(TOKENIZER_MAPPING.keys())}." - ) diff --git a/src/openllm/models/chatglm/__init__.py b/src/openllm/models/chatglm/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/chatglm/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/dolly_v2/__init__.py b/src/openllm/models/dolly_v2/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/dolly_v2/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/flan_t5/__init__.py b/src/openllm/models/flan_t5/__init__.py index 567e8c8d..b8e52a44 100644 --- a/src/openllm/models/flan_t5/__init__.py +++ b/src/openllm/models/flan_t5/__init__.py @@ -21,7 +21,7 @@ from openllm.utils import import_utils_shim as imports _import_structure = { "configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"], - "service_flan_t5": ["svc", "model_runner", "tokenizer_runner", "generate"], + "service_flan_t5": ["svc", "model_runner", "generate"], } try: @@ -30,7 +30,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flan_t5"] = ["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"] + _import_structure["modeling_flan_t5"] = ["FlanT5"] try: if not imports.is_flax_available(): @@ -38,7 +38,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5", "FlaxFlanT5WithTokenizer"] + _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"] try: if not imports.is_tf_available(): @@ -46,7 +46,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flax_flan_t5"] = ["TFFlanT5", "TFFlanT5WithTokenizer"] + _import_structure["modeling_flax_flan_t5"] = ["TFFlanT5"] if t.TYPE_CHECKING: @@ -55,6 +55,7 @@ if t.TYPE_CHECKING: from .configuration_flan_t5 import \ START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING from .configuration_flan_t5 import FlanT5Config as FlanT5Config + from .service_flan_t5 import svc as svc try: if not imports.is_torch_available(): @@ -63,9 +64,6 @@ if t.TYPE_CHECKING: pass else: from .modeling_flan_t5 import FlanT5 as FlanT5 - from .modeling_flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer - from .modeling_flan_t5 import \ - FlanT5WithTokenizer as FlanT5WithTokenizer try: if not 
imports.is_flax_available(): @@ -74,8 +72,6 @@ if t.TYPE_CHECKING: pass else: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5 - from .modeling_flax_flan_t5 import \ - FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer try: if not imports.is_tf_available(): @@ -84,8 +80,6 @@ if t.TYPE_CHECKING: pass else: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5 - from .modeling_tf_flan_t5 import \ - TFFlanT5WithTokenizer as TFFlanT5WithTokenizer else: import sys diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py index fd1d0c54..0758d379 100644 --- a/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -17,8 +17,7 @@ import typing as t import openllm -from ...runner_utils import (LLMRunnable, assign_start_model_name, - generate_tokenizer_runner) +from ...runner_utils import LLMRunnable from .configuration_flan_t5 import FlanT5Config if t.TYPE_CHECKING: @@ -62,37 +61,11 @@ def import_model( return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer}) -def _FlanT5Tokenizer( - pretrained_or_path: str | None = None, embedded: bool = True, **kwargs: t.Any -) -> openllm.types.TokenizerRunner: - """Get the runner for the tokenizer. - - Args: - model_name: The name of the FLAN-T5 model to import. - embedded: Whether to use the embedded runner or not. - **kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructors. - - Returns: - The runner for the tokenizer. - """ - if pretrained_or_path is None: - pretrained_or_path = FlanT5.default_model - - return generate_tokenizer_runner( - import_model(pretrained_or_path, **kwargs).custom_objects["tokenizer"], embedded=embedded - ) - - -FlanT5Tokenizer = assign_start_model_name("flan-t5")(_FlanT5Tokenizer) - - -class FlanT5( - LLMRunnable[transformers.T5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" -): +class FlanT5(LLMRunnable, start_model_name="flan-t5"): default_model: str = "google/flan-t5-large" config_class = FlanT5Config - ATTACH_TOKENIZER = False + ATTACH_TOKENIZER = True _llm_config: FlanT5Config @@ -106,9 +79,10 @@ class FlanT5( "google/flan-t5-xxl", ] + @torch.inference_mode() def _generate( self, - input_ids: torch.Tensor, + prompt: str, max_length: int | None = None, do_sample: bool = True, temperature: float | None = None, @@ -117,7 +91,9 @@ class FlanT5( repetition_penalty: float | None = None, **kwargs: t.Any, ) -> torch.Tensor: - return self.model.generate( + input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids + input_ids = input_ids.to(self.device) + outputs = self.model.generate( input_ids, max_length=max_length if max_length is not None else self._llm_config.max_length, do_sample=do_sample, @@ -129,15 +105,4 @@ class FlanT5( else self._llm_config.repetition_penalty, **kwargs, ) - - -class FlanT5WithTokenizer(FlanT5, start_model_name="flan-t5"): - default_model: str = "google/flan-t5-large" - - ATTACH_TOKENIZER = True - - def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: - input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids - input_ids = input_ids.to(self.device) - outputs = super()._generate(input_ids, **kwargs) return self.tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index 4fb1abd8..b742241d 100644 --- 
a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -61,13 +61,11 @@ def import_model( return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer}) -class FlaxFlanT5( - LLMRunnable[transformers.FlaxT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" -): +class FlaxFlanT5(LLMRunnable, start_model_name="flan-t5"): default_model: str = "google/flan-t5-large" config_class = FlanT5Config - ATTACH_TOKENIZER = False + ATTACH_TOKENIZER = True _llm_config: FlanT5Config @@ -81,7 +79,7 @@ class FlaxFlanT5( def _generate( self, - input_ids: jnp.ndarray, + prompt: str, max_length: int | None = None, do_sample: bool = True, temperature: float | None = None, @@ -90,7 +88,8 @@ class FlaxFlanT5( repetition_penalty: float | None = None, **kwargs: t.Any, ) -> jnp.ndarray: - return self.model.generate( + input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"] + outputs = self.model.generate( input_ids, max_length=max_length if max_length is not None else self._llm_config.max_length, do_sample=do_sample, @@ -102,16 +101,6 @@ class FlaxFlanT5( else self._llm_config.repetition_penalty, **kwargs, ) - - -class FlaxFlanT5WithTokenizer(FlaxFlanT5, start_model_name="flan-t5"): - default_model: str = "google/flan-t5-large" - - ATTACH_TOKENIZER = True - - def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: - input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"] - outputs = super()._generate(input_ids, **kwargs) return self.tokenizer.batch_decode( outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True ) diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index da0a999a..6dd45e5e 100644 --- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -60,13 +60,11 @@ def import_model( return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer}) -class TFFlanT5( - LLMRunnable[transformers.TFT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" -): +class TFFlanT5(LLMRunnable, start_model_name="flan-t5"): default_model: str = "google/flan-t5-large" config_class = FlanT5Config - ATTACH_TOKENIZER = False + ATTACH_TOKENIZER = True _llm_config: FlanT5Config @@ -80,7 +78,7 @@ class TFFlanT5( def _generate( self, - input_ids: tf.Tensor, + prompt: str, max_length: int | None = None, do_sample: bool = True, temperature: float | None = None, @@ -89,7 +87,8 @@ class TFFlanT5( repetition_penalty: float | None = None, **kwargs: t.Any, ) -> tf.Tensor: - return self.model.generate( + input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids + outputs = self.model.generate( input_ids, max_length=max_length if max_length is not None else self._llm_config.max_length, do_sample=do_sample, @@ -101,14 +100,4 @@ class TFFlanT5( else self._llm_config.repetition_penalty, **kwargs, ) - - -class TFFlanT5WithTokenizer(TFFlanT5, start_model_name="flan-t5"): - default_model: str = "google/flan-t5-large" - - ATTACH_TOKENIZER = True - - def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: - input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids - outputs = super()._generate(input_ids, **kwargs) return self.tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/src/openllm/models/flan_t5/service_flan_t5.py 
b/src/openllm/models/flan_t5/service_flan_t5.py index ccc54f80..361e249e 100644 --- a/src/openllm/models/flan_t5/service_flan_t5.py +++ b/src/openllm/models/flan_t5/service_flan_t5.py @@ -31,9 +31,8 @@ else: raise ValueError(f"Invalid framework {framework}") model_runner = klass.create_runner("flan-t5") -tokenizer_runner = openllm.AutoTokenizer.create_runner("flan-t5") -svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner, tokenizer_runner]) +svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner]) @svc.api( @@ -43,17 +42,7 @@ svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), ru async def generate(qa: openllm.schema.GenerateInput) -> openllm.schema.GenerateOutput: """Returns the generated text from given prompts.""" llm_config = model_runner.llm_config.with_options(**qa.llm_config).dict() - - return_tensors = "np" if framework == "flax" else framework - input_tensor = await tokenizer_runner.async_run(qa.prompt, return_tensors=return_tensors) - if framework == "flax": - outputs = await model_runner.generate.async_run(input_tensor["input_ids"], **llm_config) - responses = await tokenizer_runner.batch_decode.async_run( - outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - else: - outputs = await model_runner.generate.async_run(input_tensor.input_ids, **llm_config) - responses = await tokenizer_runner.batch_decode.async_run(outputs, skip_special_tokens=True) + responses = await model_runner.generate.async_run(qa.prompt, **llm_config) return openllm.schema.GenerateOutput(responses=responses, configuration=llm_config) diff --git a/src/openllm/models/gpt_neox/__init__.py b/src/openllm/models/gpt_neox/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/gpt_neox/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/gptj/__init__.py b/src/openllm/models/gptj/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/gptj/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/llama/__init__.py b/src/openllm/models/llama/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/llama/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/roberta/__init__.py b/src/openllm/models/roberta/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/roberta/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/stablelm/__init__.py b/src/openllm/models/stablelm/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/stablelm/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/runner_utils.py b/src/openllm/runner_utils.py index 3b81c95b..578c29e6 100644 --- a/src/openllm/runner_utils.py +++ b/src/openllm/runner_utils.py @@ -44,9 +44,6 @@ else: logger = logging.getLogger(__name__) -M = t.TypeVar("M") -T = t.TypeVar("T") - def assign_start_model_name(start_model_name: str) -> t.Callable[..., t.Any]: def wrapper(fn: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]: @@ -156,7 +153,7 @@ class BaseLLMRunnable(bentoml.Runnable, 
ABC): # TODO: Add support for model validation -class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]): +class LLMRunnable(BaseLLMRunnable): # The section below defines a loose contract with langchain's LLM interface. @property def _llm_type(self) -> str: @@ -173,8 +170,8 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]): # XXX: INTERNAL _module: LLMModuleType - _model: M | None = None - _tokenizer: T | None = None + _model: t.Any | None = None + _tokenizer: t.Any | None = None def __setattr__(self, attr_name: str, value: t.Any) -> None: if attr_name in ("ATTACH_TOKENIZER",): @@ -249,18 +246,18 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]): return super().__getattribute__(item) @classmethod - def dummy_object(cls) -> LLMRunnable[M, T]: + def dummy_object(cls) -> LLMRunnable: return cls(_dummy=True, _internal=True) @property - def model(self) -> M: + def model(self) -> t.Any: # NOTE: should we have support for nested runner here? if self._model is None: self._model = self._bentomodel.load_model() return self._model @property - def tokenizer(self) -> T: + def tokenizer(self) -> t.Any: # This is the runner generated from the bento model. This can # then be used for implementation of _generate. if self._tokenizer is None: @@ -368,7 +365,7 @@ class LLMRunner(bentoml.Runner): def __init__( self, - runnable_class: type[LLMRunnable[t.Any, t.Any]], + runnable_class: type[LLMRunnable], llm_config: LLMConfig, **kwargs: t.Any, ): diff --git a/src/openllm/types.py b/src/openllm/types.py index dab968c6..bb1c1acb 100644 --- a/src/openllm/types.py +++ b/src/openllm/types.py @@ -48,14 +48,10 @@ class LLMModuleType(LazyLoader): ) -> bentoml.Model: ... - class LLMConfigImpl(LLMConfig): + class LLMConfigImpl(LLMConfig, model_name="dummy"): ... - class LLMRunnableImpl(LLMRunnable[t.Any, t.Any], start_model_name="dummy"): - ... - - @staticmethod - def RunnableNameTokenizer(model_name: str | None = None, embedded: bool = True, **kwargs: t.Any) -> TokenizerRunner: + class LLMRunnableImpl(LLMRunnable, start_model_name="dummy"): ...