diff --git a/src/openllm/__init__.py b/src/openllm/__init__.py index cc7b1638..33ae1c07 100644 --- a/src/openllm/__init__.py +++ b/src/openllm/__init__.py @@ -34,7 +34,7 @@ _import_structure = { "cli": [], "configuration_utils": ["LLMConfig"], "exceptions": [], - "runner_utils": ["LLMRunner", "LLMRunnable", "generate_tokenizer_runner"], + "runner_utils": ["LLMRunner", "LLMRunnable"], "schema": ["PromptTemplate"], "server_utils": ["start", "start_grpc"], "types": [], @@ -42,13 +42,7 @@ _import_structure = { "models": [], "client": [], # NOTE: models - "models.auto": [ - "AutoConfig", - "CONFIG_MAPPING", - "AutoTokenizer", - "TOKENIZER_MAPPING", - "TOKENIZER_MAPPING_NAMES", - ], + "models.auto": ["AutoConfig", "CONFIG_MAPPING"], "models.flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING"], } @@ -58,17 +52,8 @@ try: except MissingDependencyError: pass else: - _import_structure["models.flan_t5"].extend(["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"]) - _import_structure["models.auto"].extend( - [ - "AutoLLM", - "AutoLLMWithTokenizer", - "MODEL_MAPPING_NAMES", - "MODEL_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_MAPPING", - "MODEL_WITH_TOKENIZER_MAPPING", - ] - ) + _import_structure["models.flan_t5"].extend(["FlanT5"]) + _import_structure["models.auto"].extend(["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"]) try: if not imports.is_flax_available(): @@ -76,17 +61,8 @@ try: except MissingDependencyError: pass else: - _import_structure["models.flan_t5"].extend(["FlaxFlanT5", "FlaxFlanT5WithTokenizer"]) - _import_structure["models.auto"].extend( - [ - "AutoFlaxLLM", - "AutoFlaxLLMWithTokenizer", - "MODEL_FLAX_MAPPING_NAMES", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_FLAX_MAPPING", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING", - ] - ) + _import_structure["models.flan_t5"].extend(["FlaxFlanT5"]) + _import_structure["models.auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"]) try: if not imports.is_tf_available(): @@ -94,17 +70,8 @@ try: except MissingDependencyError: pass else: - _import_structure["models.flan_t5"].extend(["TFFlanT5", "TFFlanT5WithTokenizer"]) - _import_structure["models.auto"].extend( - [ - "AutoTFLLM", - "AutoTFLLMWithTokenizer", - "MODEL_TF_MAPPING_NAMES", - "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_TF_MAPPING", - "MODEL_TF_WITH_TOKENIZER_MAPPING", - ] - ) + _import_structure["models.flan_t5"].extend(["TFFlanT5"]) + _import_structure["models.auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"]) # declaration for OpenLLM-related modules @@ -123,17 +90,12 @@ if t.TYPE_CHECKING: # Specific types import from .configuration_utils import LLMConfig as LLMConfig from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING - from .models.auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING - from .models.auto import TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES from .models.auto import AutoConfig as AutoConfig - from .models.auto import AutoTokenizer as AutoTokenizer from .models.flan_t5 import \ START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING from .models.flan_t5 import FlanT5Config as FlanT5Config from .runner_utils import LLMRunnable as LLMRunnable from .runner_utils import LLMRunner as LLMRunner - from .runner_utils import \ - generate_tokenizer_runner as generate_tokenizer_runner from .schema import PromptTemplate as PromptTemplate from .server_utils import start as start from .server_utils import start_grpc as start_grpc @@ -146,16 +108,8 @@ if t.TYPE_CHECKING: else: from .models.auto 
import MODEL_MAPPING as MODEL_MAPPING from .models.auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES - from .models.auto import \ - MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING - from .models.auto import \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES from .models.auto import AutoLLM as AutoLLM - from .models.auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer from .models.flan_t5 import FlanT5 as FlanT5 - from .models.flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer - from .models.flan_t5 import FlanT5WithTokenizer as FlanT5WithTokenizer try: if not imports.is_flax_available(): @@ -166,18 +120,8 @@ if t.TYPE_CHECKING: from .models.auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING from .models.auto import \ MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES - from .models.auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING - from .models.auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES from .models.auto import AutoFlaxLLM as AutoFlaxLLM - from .models.auto import \ - AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5 - from .models.flan_t5 import \ - FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer try: if not imports.is_tf_available(): @@ -188,17 +132,8 @@ if t.TYPE_CHECKING: from .models.auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING from .models.auto import \ MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES - from .models.auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING - from .models.auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES from .models.auto import AutoTFLLM as AutoTFLLM - from .models.auto import \ - AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer from .models.flan_t5 import TFFlanT5 as TFFlanT5 - from .models.flan_t5 import \ - TFFlanT5WithTokenizer as TFFlanT5WithTokenizer else: import sys diff --git a/src/openllm/models/auto/__init__.py b/src/openllm/models/auto/__init__.py index 53bd82b2..1ea6f531 100644 --- a/src/openllm/models/auto/__init__.py +++ b/src/openllm/models/auto/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""This module is derived from HuggingFace's AutoConfig, Tokenizer, AutoModel, etc.""" +"""This module is derived from HuggingFace's AutoConfig, AutoModel, etc.""" from __future__ import annotations @@ -25,7 +25,6 @@ from ...utils import import_utils_shim as imports _import_structure = { "configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"], - "tokenization_auto": ["AutoTokenizer", "TOKENIZER_MAPPING", "TOKENIZER_MAPPING_NAMES"], } try: @@ -34,14 +33,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_auto"] = [ - "AutoLLM", - "AutoLLMWithTokenizer", - "MODEL_MAPPING_NAMES", - "MODEL_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_MAPPING", - "MODEL_WITH_TOKENIZER_MAPPING", - ] + _import_structure["modeling_auto"] = ["AutoLLM", "MODEL_MAPPING_NAMES", "MODEL_MAPPING"] try: if not imports.is_flax_available(): @@ -49,14 +41,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flax_auto"] = [ - "AutoFlaxLLM", - "AutoFlaxLLMWithTokenizer", - "MODEL_FLAX_MAPPING_NAMES", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_FLAX_MAPPING", - "MODEL_FLAX_WITH_TOKENIZER_MAPPING", - ] + _import_structure["modeling_flax_auto"] = ["AutoFlaxLLM", "MODEL_FLAX_MAPPING_NAMES", "MODEL_FLAX_MAPPING"] try: if not imports.is_tf_available(): @@ -64,24 +49,13 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_tf_auto"] = [ - "AutoTFLLM", - "AutoTFLLMWithTokenizer", - "MODEL_TF_MAPPING_NAMES", - "MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES", - "MODEL_TF_MAPPING", - "MODEL_TF_WITH_TOKENIZER_MAPPING", - ] + _import_structure["modeling_tf_auto"] = ["AutoTFLLM", "MODEL_TF_MAPPING_NAMES", "MODEL_TF_MAPPING"] if t.TYPE_CHECKING: from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING from .configuration_auto import \ CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES from .configuration_auto import AutoConfig as AutoConfig - from .tokenization_auto import TOKENIZER_MAPPING as TOKENIZER_MAPPING - from .tokenization_auto import \ - TOKENIZER_MAPPING_NAMES as TOKENIZER_MAPPING_NAMES - from .tokenization_auto import AutoTokenizer as AutoTokenizer try: if not imports.is_torch_available(): @@ -91,13 +65,7 @@ if t.TYPE_CHECKING: else: from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES - from .modeling_auto import \ - MODEL_WITH_TOKENIZER_MAPPING as MODEL_WITH_TOKENIZER_MAPPING - from .modeling_auto import \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_WITH_TOKENIZER_MAPPING_NAMES from .modeling_auto import AutoLLM as AutoLLM - from .modeling_auto import AutoLLMWithTokenizer as AutoLLMWithTokenizer try: if not imports.is_flax_available(): @@ -109,15 +77,7 @@ if t.TYPE_CHECKING: MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING from .modeling_flax_auto import \ MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES - from .modeling_flax_auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING - from .modeling_flax_auto import \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES from .modeling_flax_auto import AutoFlaxLLM as AutoFlaxLLM - from .modeling_flax_auto import \ - AutoFlaxLLMWithTokenizer as AutoFlaxLLMWithTokenizer try: if not imports.is_tf_available(): @@ -128,14 +88,7 @@ if t.TYPE_CHECKING: from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING from .modeling_tf_auto import \ MODEL_TF_MAPPING_NAMES as 
MODEL_TF_MAPPING_NAMES - from .modeling_tf_auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING as MODEL_TF_WITH_TOKENIZER_MAPPING - from .modeling_tf_auto import \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES as \ - MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES from .modeling_tf_auto import AutoTFLLM as AutoTFLLM - from .modeling_tf_auto import \ - AutoTFLLMWithTokenizer as AutoTFLLMWithTokenizer else: import sys diff --git a/src/openllm/models/auto/factory.py b/src/openllm/models/auto/factory.py index 0a21c4de..347059b8 100644 --- a/src/openllm/models/auto/factory.py +++ b/src/openllm/models/auto/factory.py @@ -23,7 +23,8 @@ import openllm from .configuration_auto import AutoConfig -def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable[t.Any, t.Any]]: + +def _get_runnable_class(config: openllm.LLMConfig, runnable_mapping: _LazyAutoMapping) -> type[openllm.LLMRunnable]: supported_runnables = runnable_mapping[type(config)] if not isinstance(supported_runnables, (list, tuple)): return supported_runnables @@ -72,7 +73,7 @@ class _BaseAutoRunnerFactory: ) @classmethod - def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable[t.Any, t.Any]]): + def register(cls, config_class: type[openllm.LLMConfig], runnable_class: type[openllm.LLMRunnable]): """ Register a new model for this class. diff --git a/src/openllm/models/auto/modeling_auto.py b/src/openllm/models/auto/modeling_auto.py index 28478dc9..cdea672f 100644 --- a/src/openllm/models/auto/modeling_auto.py +++ b/src/openllm/models/auto/modeling_auto.py @@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping MODEL_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5")]) -MODEL_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5WithTokenizer")]) - MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) -MODEL_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_TOKENIZER_MAPPING_NAMES) - class AutoLLM(_BaseAutoRunnerFactory): _model_mapping = MODEL_MAPPING - - -class AutoLLMWithTokenizer(_BaseAutoRunnerFactory): - _model_mapping = MODEL_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/modeling_flax_auto.py b/src/openllm/models/auto/modeling_flax_auto.py index 3429b902..dfb8e087 100644 --- a/src/openllm/models/auto/modeling_flax_auto.py +++ b/src/openllm/models/auto/modeling_flax_auto.py @@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5")]) -MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5WithTokenizer")]) - MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES) -MODEL_FLAX_WITH_TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_WITH_TOKENIZER_MAPPING_NAMES) - class AutoFlaxLLM(_BaseAutoRunnerFactory): _model_mapping = MODEL_FLAX_MAPPING - - -class AutoFlaxLLMWithTokenizer(_BaseAutoRunnerFactory): - _model_mapping = MODEL_FLAX_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/modeling_tf_auto.py b/src/openllm/models/auto/modeling_tf_auto.py index dbb3c762..668bfe3c 100644 --- a/src/openllm/models/auto/modeling_tf_auto.py +++ b/src/openllm/models/auto/modeling_tf_auto.py @@ -21,16 +21,8 @@ from .factory import _BaseAutoRunnerFactory, _LazyAutoMapping MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5")]) -MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", 
"FlaxFlanT5WithTokenizer")]) - -MODEL_TF_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES) - -MODEL_TF_WITH_TOKENIZER_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_TF_WITH_TOKENIZER_MAPPING_NAMES) +MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES) class AutoTFLLM(_BaseAutoRunnerFactory): _model_mapping = MODEL_TF_MAPPING - - -class AutoTFLLMWithTokenizer(_BaseAutoRunnerFactory): - _model_mapping = MODEL_TF_WITH_TOKENIZER_MAPPING diff --git a/src/openllm/models/auto/tokenization_auto.py b/src/openllm/models/auto/tokenization_auto.py deleted file mode 100644 index a7ebfeac..00000000 --- a/src/openllm/models/auto/tokenization_auto.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2023 BentoML Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import typing as t -from collections import OrderedDict - -import openllm - -from .configuration_auto import _LazyConfigMapping - -TOKENIZER_MAPPING_NAMES = OrderedDict([("flan_t5", "FlanT5Tokenizer")]) - -TOKENIZER_MAPPING = _LazyConfigMapping(TOKENIZER_MAPPING_NAMES) - - -class AutoTokenizer: - def __init__(self): - raise EnvironmentError( - "This class should not be initialized directly. Instead use 'Tokenizer.create_runner' instead" - ) - - @classmethod - def create_runner(cls, model_name: str, pretrained_or_path: str | None = None, **kwargs: t.Any): - model_name = openllm.utils.kebab_to_snake_case(model_name) - if model_name in TOKENIZER_MAPPING: - tokenizer_class = TOKENIZER_MAPPING[model_name] - if pretrained_or_path is None: - pretrained_or_path = openllm.utils.get_pretrained_env(tokenizer_class.start_model_name) - return tokenizer_class(pretrained_or_path, **kwargs) - raise ValueError( - f"Unrecognized model {model_name} to build an Tokenizer.\n" - f"Model type should be one of {', '.join(TOKENIZER_MAPPING.keys())}." 
- ) diff --git a/src/openllm/models/chatglm/__init__.py b/src/openllm/models/chatglm/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/chatglm/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/dolly_v2/__init__.py b/src/openllm/models/dolly_v2/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/dolly_v2/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/flan_t5/__init__.py b/src/openllm/models/flan_t5/__init__.py index 567e8c8d..b8e52a44 100644 --- a/src/openllm/models/flan_t5/__init__.py +++ b/src/openllm/models/flan_t5/__init__.py @@ -21,7 +21,7 @@ from openllm.utils import import_utils_shim as imports _import_structure = { "configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"], - "service_flan_t5": ["svc", "model_runner", "tokenizer_runner", "generate"], + "service_flan_t5": ["svc", "model_runner", "generate"], } try: @@ -30,7 +30,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flan_t5"] = ["FlanT5", "FlanT5WithTokenizer", "FlanT5Tokenizer"] + _import_structure["modeling_flan_t5"] = ["FlanT5"] try: if not imports.is_flax_available(): @@ -38,7 +38,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5", "FlaxFlanT5WithTokenizer"] + _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"] try: if not imports.is_tf_available(): @@ -46,7 +46,7 @@ try: except openllm.exceptions.MissingDependencyError: pass else: - _import_structure["modeling_flax_flan_t5"] = ["TFFlanT5", "TFFlanT5WithTokenizer"] + _import_structure["modeling_flax_flan_t5"] = ["TFFlanT5"] if t.TYPE_CHECKING: @@ -55,6 +55,7 @@ if t.TYPE_CHECKING: from .configuration_flan_t5 import \ START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING from .configuration_flan_t5 import FlanT5Config as FlanT5Config + from .service_flan_t5 import svc as svc try: if not imports.is_torch_available(): @@ -63,9 +64,6 @@ if t.TYPE_CHECKING: pass else: from .modeling_flan_t5 import FlanT5 as FlanT5 - from .modeling_flan_t5 import FlanT5Tokenizer as FlanT5Tokenizer - from .modeling_flan_t5 import \ - FlanT5WithTokenizer as FlanT5WithTokenizer try: if not imports.is_flax_available(): @@ -74,8 +72,6 @@ if t.TYPE_CHECKING: pass else: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5 - from .modeling_flax_flan_t5 import \ - FlaxFlanT5WithTokenizer as FlaxFlanT5WithTokenizer try: if not imports.is_tf_available(): @@ -84,8 +80,6 @@ if t.TYPE_CHECKING: pass else: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5 - from .modeling_tf_flan_t5 import \ - TFFlanT5WithTokenizer as TFFlanT5WithTokenizer else: import sys diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py index fd1d0c54..0758d379 100644 --- a/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -17,8 +17,7 @@ import typing as t import openllm -from ...runner_utils import (LLMRunnable, assign_start_model_name, - generate_tokenizer_runner) +from ...runner_utils import LLMRunnable from .configuration_flan_t5 import FlanT5Config if t.TYPE_CHECKING: @@ -62,37 +61,11 @@ def import_model( return bentoml.transformers.save_model(str(tag), model, 
custom_objects={"tokenizer": tokenizer}) -def _FlanT5Tokenizer( - pretrained_or_path: str | None = None, embedded: bool = True, **kwargs: t.Any -) -> openllm.types.TokenizerRunner: - """Get the runner for the tokenizer. - - Args: - model_name: The name of the FLAN-T5 model to import. - embedded: Whether to use the embedded runner or not. - **kwargs: Additional kwargs to pass to the ``transformers.AutoTokenizer`` constructors. - - Returns: - The runner for the tokenizer. - """ - if pretrained_or_path is None: - pretrained_or_path = FlanT5.default_model - - return generate_tokenizer_runner( - import_model(pretrained_or_path, **kwargs).custom_objects["tokenizer"], embedded=embedded - ) - - -FlanT5Tokenizer = assign_start_model_name("flan-t5")(_FlanT5Tokenizer) - - -class FlanT5( - LLMRunnable[transformers.T5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" -): +class FlanT5(LLMRunnable, start_model_name="flan-t5"): default_model: str = "google/flan-t5-large" config_class = FlanT5Config - ATTACH_TOKENIZER = False + ATTACH_TOKENIZER = True _llm_config: FlanT5Config @@ -106,9 +79,10 @@ class FlanT5( "google/flan-t5-xxl", ] + @torch.inference_mode() def _generate( self, - input_ids: torch.Tensor, + prompt: str, max_length: int | None = None, do_sample: bool = True, temperature: float | None = None, @@ -117,7 +91,9 @@ class FlanT5( repetition_penalty: float | None = None, **kwargs: t.Any, ) -> torch.Tensor: - return self.model.generate( + input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids + input_ids = input_ids.to(self.device) + outputs = self.model.generate( input_ids, max_length=max_length if max_length is not None else self._llm_config.max_length, do_sample=do_sample, @@ -129,15 +105,4 @@ class FlanT5( else self._llm_config.repetition_penalty, **kwargs, ) - - -class FlanT5WithTokenizer(FlanT5, start_model_name="flan-t5"): - default_model: str = "google/flan-t5-large" - - ATTACH_TOKENIZER = True - - def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: - input_ids: torch.Tensor = self.tokenizer(prompt, return_tensors="pt").input_ids - input_ids = input_ids.to(self.device) - outputs = super()._generate(input_ids, **kwargs) return self.tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index 4fb1abd8..b742241d 100644 --- a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -61,13 +61,11 @@ def import_model( return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer}) -class FlaxFlanT5( - LLMRunnable[transformers.FlaxT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" -): +class FlaxFlanT5(LLMRunnable, start_model_name="flan-t5"): default_model: str = "google/flan-t5-large" config_class = FlanT5Config - ATTACH_TOKENIZER = False + ATTACH_TOKENIZER = True _llm_config: FlanT5Config @@ -81,7 +79,7 @@ class FlaxFlanT5( def _generate( self, - input_ids: jnp.ndarray, + prompt: str, max_length: int | None = None, do_sample: bool = True, temperature: float | None = None, @@ -90,7 +88,8 @@ class FlaxFlanT5( repetition_penalty: float | None = None, **kwargs: t.Any, ) -> jnp.ndarray: - return self.model.generate( + input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"] + outputs = self.model.generate( input_ids, max_length=max_length if max_length is not None else 
self._llm_config.max_length, do_sample=do_sample, @@ -102,16 +101,6 @@ class FlaxFlanT5( else self._llm_config.repetition_penalty, **kwargs, ) - - -class FlaxFlanT5WithTokenizer(FlaxFlanT5, start_model_name="flan-t5"): - default_model: str = "google/flan-t5-large" - - ATTACH_TOKENIZER = True - - def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: - input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"] - outputs = super()._generate(input_ids, **kwargs) return self.tokenizer.batch_decode( outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True ) diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index da0a999a..6dd45e5e 100644 --- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -60,13 +60,11 @@ def import_model( return bentoml.transformers.save_model(str(tag), model, custom_objects={"tokenizer": tokenizer}) -class TFFlanT5( - LLMRunnable[transformers.TFT5ForConditionalGeneration, transformers.T5TokenizerFast], start_model_name="flan-t5" -): +class TFFlanT5(LLMRunnable, start_model_name="flan-t5"): default_model: str = "google/flan-t5-large" config_class = FlanT5Config - ATTACH_TOKENIZER = False + ATTACH_TOKENIZER = True _llm_config: FlanT5Config @@ -80,7 +78,7 @@ class TFFlanT5( def _generate( self, - input_ids: tf.Tensor, + prompt: str, max_length: int | None = None, do_sample: bool = True, temperature: float | None = None, @@ -89,7 +87,8 @@ class TFFlanT5( repetition_penalty: float | None = None, **kwargs: t.Any, ) -> tf.Tensor: - return self.model.generate( + input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids + outputs = self.model.generate( input_ids, max_length=max_length if max_length is not None else self._llm_config.max_length, do_sample=do_sample, @@ -101,14 +100,4 @@ class TFFlanT5( else self._llm_config.repetition_penalty, **kwargs, ) - - -class TFFlanT5WithTokenizer(TFFlanT5, start_model_name="flan-t5"): - default_model: str = "google/flan-t5-large" - - ATTACH_TOKENIZER = True - - def _generate(self, prompt: str, **kwargs: t.Any) -> list[str]: - input_ids: tf.Tensor = self.tokenizer(prompt, return_tensors="tf").input_ids - outputs = super()._generate(input_ids, **kwargs) return self.tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/src/openllm/models/flan_t5/service_flan_t5.py b/src/openllm/models/flan_t5/service_flan_t5.py index ccc54f80..361e249e 100644 --- a/src/openllm/models/flan_t5/service_flan_t5.py +++ b/src/openllm/models/flan_t5/service_flan_t5.py @@ -31,9 +31,8 @@ else: raise ValueError(f"Invalid framework {framework}") model_runner = klass.create_runner("flan-t5") -tokenizer_runner = openllm.AutoTokenizer.create_runner("flan-t5") -svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner, tokenizer_runner]) +svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), runners=[model_runner]) @svc.api( @@ -43,17 +42,7 @@ svc = bentoml.Service(name=openllm.utils.generate_service_name(model_runner), ru async def generate(qa: openllm.schema.GenerateInput) -> openllm.schema.GenerateOutput: """Returns the generated text from given prompts.""" llm_config = model_runner.llm_config.with_options(**qa.llm_config).dict() - - return_tensors = "np" if framework == "flax" else framework - input_tensor = await tokenizer_runner.async_run(qa.prompt, return_tensors=return_tensors) - if framework == "flax": - 
outputs = await model_runner.generate.async_run(input_tensor["input_ids"], **llm_config) - responses = await tokenizer_runner.batch_decode.async_run( - outputs.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - else: - outputs = await model_runner.generate.async_run(input_tensor.input_ids, **llm_config) - responses = await tokenizer_runner.batch_decode.async_run(outputs, skip_special_tokens=True) + responses = await model_runner.generate.async_run(qa.prompt, **llm_config) return openllm.schema.GenerateOutput(responses=responses, configuration=llm_config) diff --git a/src/openllm/models/gpt_neox/__init__.py b/src/openllm/models/gpt_neox/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/gpt_neox/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/gptj/__init__.py b/src/openllm/models/gptj/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/gptj/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/llama/__init__.py b/src/openllm/models/llama/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/llama/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/roberta/__init__.py b/src/openllm/models/roberta/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/roberta/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/models/stablelm/__init__.py b/src/openllm/models/stablelm/__init__.py new file mode 100644 index 00000000..249445f3 --- /dev/null +++ b/src/openllm/models/stablelm/__init__.py @@ -0,0 +1 @@ +raise NotImplementedError("This module is not implemented yet.") diff --git a/src/openllm/runner_utils.py b/src/openllm/runner_utils.py index 3b81c95b..578c29e6 100644 --- a/src/openllm/runner_utils.py +++ b/src/openllm/runner_utils.py @@ -44,9 +44,6 @@ else: logger = logging.getLogger(__name__) -M = t.TypeVar("M") -T = t.TypeVar("T") - def assign_start_model_name(start_model_name: str) -> t.Callable[..., t.Any]: def wrapper(fn: t.Callable[..., t.Any]) -> t.Callable[..., t.Any]: @@ -156,7 +153,7 @@ class BaseLLMRunnable(bentoml.Runnable, ABC): # TODO: Add support for model validation -class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]): +class LLMRunnable(BaseLLMRunnable): # The section below defines a loose contract with langchain's LLM interface. @property def _llm_type(self) -> str: @@ -173,8 +170,8 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]): # XXX: INTERNAL _module: LLMModuleType - _model: M | None = None - _tokenizer: T | None = None + _model: t.Any | None = None + _tokenizer: t.Any | None = None def __setattr__(self, attr_name: str, value: t.Any) -> None: if attr_name in ("ATTACH_TOKENIZER",): @@ -249,18 +246,18 @@ class LLMRunnable(BaseLLMRunnable, t.Generic[M, T]): return super().__getattribute__(item) @classmethod - def dummy_object(cls) -> LLMRunnable[M, T]: + def dummy_object(cls) -> LLMRunnable: return cls(_dummy=True, _internal=True) @property - def model(self) -> M: + def model(self) -> t.Any: # NOTE: should we have support for nested runner here? 
if self._model is None: self._model = self._bentomodel.load_model() return self._model @property - def tokenizer(self) -> T: + def tokenizer(self) -> t.Any: # This is the runner generated from the bento model. This can # then be used for implementation of _generate. if self._tokenizer is None: @@ -368,7 +365,7 @@ class LLMRunner(bentoml.Runner): def __init__( self, - runnable_class: type[LLMRunnable[t.Any, t.Any]], + runnable_class: type[LLMRunnable], llm_config: LLMConfig, **kwargs: t.Any, ): diff --git a/src/openllm/types.py b/src/openllm/types.py index dab968c6..bb1c1acb 100644 --- a/src/openllm/types.py +++ b/src/openllm/types.py @@ -48,14 +48,10 @@ class LLMModuleType(LazyLoader): ) -> bentoml.Model: ... - class LLMConfigImpl(LLMConfig): + class LLMConfigImpl(LLMConfig, model_name="dummy"): ... - class LLMRunnableImpl(LLMRunnable[t.Any, t.Any], start_model_name="dummy"): - ... - - @staticmethod - def RunnableNameTokenizer(model_name: str | None = None, embedded: bool = True, **kwargs: t.Any) -> TokenizerRunner: + class LLMRunnableImpl(LLMRunnable, start_model_name="dummy"): ...
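
Usage note (not part of the patch): after this change each framework-specific runnable owns its tokenizer (ATTACH_TOKENIZER = True), so the separate tokenizer runner and the *WithTokenizer subclasses are gone and the service hands the raw prompt straight to the model runner. Below is a minimal sketch of the consolidated call path, assuming a BentoML runner initialized locally; the prompt text and the max_length value are illustrative placeholders, not part of this diff.

import openllm

# A single runner now owns both the model and its tokenizer, so no
# tokenizer runner is created alongside it.
model_runner = openllm.AutoLLM.create_runner("flan-t5")

# init_local() is BentoML's way to exercise a runner outside a service;
# shown here only for local experimentation.
model_runner.init_local()

# _generate now accepts a raw prompt string and returns decoded text:
# tokenization, generation, and batch_decode all happen inside the runnable.
responses = model_runner.generate.run(
    "Translate to German: The house is wonderful.",
    max_length=64,
)
print(responses)

Folding the tokenizer into the runnable also means a request crosses the runner boundary once as a string instead of shuttling tensors between a tokenizer runner and a model runner, which is what lets service_flan_t5.py drop its framework-specific tokenize/decode branches.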