diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8fabc34d..ba99309e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,7 +20,7 @@ ci:
exclude: '.*\.(css|js|svg)$'
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: 'v0.0.278'
+ rev: 'v0.0.280'
hooks:
- id: ruff
args: [--exit-non-zero-on-fix, --show-fixes]
@@ -28,6 +28,8 @@ repos:
rev: 23.7.0
hooks:
- id: black-jupyter
+ args: [--config=pyproject.toml]
+ exclude: (?x)^(src/openllm/models/.*)$
- repo: https://github.com/econchick/interrogate
rev: 1.5.0
hooks:
@@ -50,7 +52,6 @@ repos:
tools/.*|
tests/.*|
src/openllm/playground/.*|
- src/openllm/models/.*|
.github/.*
)$
additional_dependencies: ["mypy==1.4.1", "types-tabulate", "types-Deprecated", "types-PyYAML", "types-decorator", "types-protobuf", "types-python-dateutil", "types-requests", "types-setuptools", "types-six", "types-ujson", "pandas-stubs", "types-Pillow", "types-Pygments", "types-appdirs", "types-colorama", "types-google-cloud-ndb", "types-jsonschema", "types-psutil", "types-pywin32", "types-tqdm", "types-openpyxl"]
diff --git a/README.md b/README.md
index 65eaeb57..d5a85db4 100644
--- a/README.md
+++ b/README.md
@@ -299,7 +299,7 @@ pip install "openllm[mpt]"
| opt |
-MPTForCausalLM |
+OPTForCausalLM |
✅ |
✅ |
diff --git a/changelog.d/133.feature.md b/changelog.d/133.feature.md
new file mode 100644
index 00000000..a86d1a64
--- /dev/null
+++ b/changelog.d/133.feature.md
@@ -0,0 +1,14 @@
+APIs for LLMService are now provisional based on the capabilities of the LLM.
+
+The following APIs are considered provisional:
+
+- `/v1/embeddings`: This will be available if the LLM supports embeddings (i.e. ``LLM.embeddings`` is implemented; example models include ``llama``)
+- `/hf/agent`: This will be available if the LLM supports running HF agents (i.e. ``LLM.generate_one`` is implemented; example models include ``starcoder`` and ``falcon``)
+- `POST /v1/adapters` and `GET /v1/adapters`: This will be available if the server is running with LoRA weights
+
+``openllm.LLMRunner`` now includes three additional boolean attributes:
+- `runner.supports_embeddings`: Whether this runner supports embeddings
+- `runner.supports_hf_agent`: Whether this runner supports HF agents
+- `runner.has_adapters`: Whether this runner is loaded with LoRA adapters.
+
+Optimized the bytecode performance of ``openllm.models``
diff --git a/pyproject.toml b/pyproject.toml
index 0d6f6d54..5b03d094 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -157,7 +157,7 @@ python_files = ["test_*.py", "*_test.py"]
testpaths = ["tests"]
[tool.black]
-exclude = '''
+extend-exclude = '''
(
/(
\.eggs
@@ -174,14 +174,15 @@ exclude = '''
| tools
)/
| src/openllm/__about__.py
+ | src/openllm/models
)
'''
line-length = 119
target-version = ["py38", "py39", "py310", "py311"]
[tool.ruff]
-exclude = ["tools", "src/openllm/playground"]
-extend-include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
+extend-exclude = ["tools", "src/openllm/playground", "src/openllm/models", "src/openllm/_types.py"]
+extend-include = ["*.ipynb"]
extend-select = [
"B", # flake8-bugbear
"I", # isort
@@ -223,12 +224,14 @@ ignore = [
"TCH004", # don't move runtime import out, just warn about it
"RUF012", # mutable attributes to be used with ClassVar
"B905", # zip warning about strict, only applicable for 3.10+
+ "D105", # magic docstring
]
line-length = 119
target-version = "py312"
unfixable = [
"F401", # Don't touch unused imports, just warn about it.
"TCH004", # Don't touch import outside of TYPE_CHECKING block
+ "RUF100", # unused noqa, just warn about it
]
[tool.ruff.flake8-type-checking]
exempt-modules = ["typing", "typing_extensions", "."]
@@ -255,17 +258,9 @@ avoid-escape = false
# Tests can use magic values, assertions, and relative imports
"__init__.py" = ["E402", "F401", "F403", "F811"]
"examples/**/*" = ["D"]
-"src/openllm/_llm.py" = ["B010", "B009"]
-"src/openllm/_strategies.py" = ["B904"]
-"src/openllm/_types.py" = ["E402"]
"src/openllm/cli.py" = ["D301", "S101"]
-"src/openllm/models/**/*" = ["D106", "S101", "D104"]
-"src/openllm/playground/**/*" = ["E402", "F401", "PLR", "D"]
"src/openllm/utils/dummy_*" = ["D107"]
-"src/openllm/utils/import_utils.py" = [
- "PLW0603", # OK to ignore global access here
- "D105", # magic docstring
-]
+"src/openllm/utils/import_utils.py" = ["PLW0603"]
"src/openllm_client/runtimes/*" = ["D107"]
"tests/**/*" = [
"S101",
diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index 0c29ac0a..65fc5a92 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -28,6 +28,7 @@ from abc import abstractmethod
from pathlib import Path
import attr
+import inflection
import orjson
from huggingface_hub import hf_hub_download
@@ -82,6 +83,7 @@ if t.TYPE_CHECKING:
from ._configuration import PeftType
from ._types import AdaptersMapping
from ._types import AdaptersTuple
+ from ._types import AnyCallable
from ._types import DictStrAny
from ._types import ListStr
from ._types import LiteralRuntime
@@ -161,13 +163,12 @@ def make_tag(
model_version = tag.version
model_name = tag.name
else:
- if model_version is None: # noqa: PLR5501
- if not quiet:
- logger.warning(
- "Given 'model_id=%s' is a path, and 'model_version' is not passed. OpenLLM will generate the version based on the last modified time of this given directory.",
- model_id,
- )
- model_version = generate_hash_from_file(model_id)
+ if not quiet and model_version is None:
+ logger.warning(
+ "Given 'model_id=%s' is a path, and 'model_version' is not passed. OpenLLM will generate the version based on the last modified time of this given directory.",
+ model_id,
+ )
+ model_version = first_not_none(model_version, default=generate_hash_from_file(model_id))
else:
config = t.cast(
"transformers.PretrainedConfig",
@@ -418,6 +419,15 @@ class LLMInterface(ABC, t.Generic[M, T]):
__llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None
"""A reference to the the cached LoRA adapter mapping."""
+ __llm_supports_embeddings__: bool
+ """A boolean to determine whether models does implement ``LLM.embeddings``."""
+ __llm_supports_generate__: bool
+ """A boolean to determine whether models does implement ``LLM.generate``."""
+ __llm_supports_generate_one__: bool
+ """A boolean to determine whether models does implement ``LLM.generate_one``."""
+ __llm_supports_generate_iterator__: bool
+ """A boolean to determine whether models does implement ``LLM.generate_iterator``."""
+
if t.TYPE_CHECKING and not MYPY:
def __attrs_init__(
@@ -528,6 +538,21 @@ def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]):
return wrapper
+def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable:
+ # update docstring for given entrypoint
+ original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
+ original_fn.__doc__ = (
+ original_fn.__doc__
+ or f"""\
+ {cls.__name__}'s implementation for {fn}.
+
+ Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
+ The original model can then be accessed with 'self.model.get_base_model()'.
+ """
+ )
+ setattr(cls, fn, original_fn)
+
+
def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
attributes = {
"import_model": _wrapped_import_model,
@@ -539,7 +564,11 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
args: ListStr = []
anns: DictStrAny = {}
lines: ListStr = []
- globs: DictStrAny = {"cls": cls, "_cached_LLMInterface_get": _object_getattribute.__get__(LLMInterface)}
+ globs: DictStrAny = {
+ "cls": cls,
+ "_cached_LLMInterface_get": _object_getattribute.__get__(LLMInterface),
+ "__gen_docstring": _update_docstring,
+ }
# function initialisation
for func, impl in attributes.items():
impl_name = f"__wrapped_{func}"
@@ -561,9 +590,22 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
interface_anns = codegen.get_annotations(LLMInterface)
for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
lines.append(_setattr_class(f"__llm_{v}__", None))
- anns[f"__llm_{v}__"] = interface_anns.get("__llm_{v}__")
+ anns[f"__llm_{v}__"] = interface_anns.get(f"__llm_{v}__")
- return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
+ # boolean to determine whether LLM has defined an implementation for a function
+ for fn in {"generate", "generate_one", "generate_iterator", "embeddings"}:
+ key = f"__llm_supports_{fn}__"
+ lines.extend(
+ [
+ _setattr_class(key, f"cls.{fn} is not _cached_LLMInterface_get('{fn}')"),
+ f"__gen_docstring(cls, '{fn}')",
+ ]
+ )
+ anns[key] = interface_anns.get(key)
+
+ return codegen.generate_function(
+ cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns
+ )
_AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
@@ -607,28 +649,24 @@ class LLM(LLMInterface[M, T], ReprMixin):
implementation, config_class_name = cls._infer_implementation_from_name(cls.__name__)
cls.__llm_implementation__ = implementation
config_class = openllm.AutoConfig.infer_class_from_name(config_class_name)
-
if "__openllm_internal__" in cd:
if "config_class" not in cd:
cls.config_class = config_class
elif "config_class" not in cd:
raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
-
_make_assignment_script(cls)(cls)
- # update docstring for given entrypoint
- for fn in {"generate", "generate_one", "generate_iterator"}:
- original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
- original_fn.__doc__ = (
- original_fn.__doc__
- or f"""\
- '{fn}' implementation {cls.__name__}.
-
- Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
- The original can then be accessed with 'self.model.get_base_model()'.
- """
- )
- setattr(cls, fn, original_fn)
+ def __getitem__(self, item: t.LiteralString | t.Any) -> t.Any:
+ if item is None:
+ raise TypeError(f"{self} doesn't understand how to index None.")
+ item = inflection.underscore(item)
+ internal_attributes = f"__llm_{item}__"
+ if hasattr(self, internal_attributes):
+ return getattr(self, internal_attributes)
+ elif hasattr(self, item):
+ return getattr(self, item)
+ else:
+ raise KeyError(item)
@classmethod
@overload
@@ -1667,6 +1705,9 @@ def llm_runner_class(self: openllm.LLM[M, T]) -> type[LLMRunner]:
"__repr__": ReprMixin.__repr__,
"__repr_keys__": property(_wrapped_repr_keys),
"__repr_args__": _wrapped_repr_args,
+ "supports_embeddings": self["supports-embeddings"],
+ "supports_hf_agent": self["supports-generate-one"],
+ "has_adapters": self._adapters_mapping is not None,
}
),
)
diff --git a/src/openllm/_schema.py b/src/openllm/_schema.py
index deb86306..46d53d54 100644
--- a/src/openllm/_schema.py
+++ b/src/openllm/_schema.py
@@ -94,6 +94,8 @@ class MetadataOutput:
model_name: str
framework: str
configuration: str
+ supports_embeddings: bool
+ supports_hf_agent: bool
@attr.frozen(slots=True)
diff --git a/src/openllm/_service.py b/src/openllm/_service.py
index bed7ea45..924a5403 100644
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -89,48 +89,6 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
return openllm.GenerationOutput(responses=responses, configuration=config)
-@svc.api(
- input=bentoml.io.JSON.from_sample(sample=["Hey Jude, welcome to the jumgle!", "What is the meaning of life?"]),
- output=bentoml.io.JSON.from_sample(
- sample={
- "embeddings": [
- 0.007917795330286026,
- -0.014421648345887661,
- 0.00481307040899992,
- 0.007331526838243008,
- -0.0066398633643984795,
- 0.00945580005645752,
- 0.0087016262114048,
- -0.010709521360695362,
- 0.012635177001357079,
- 0.010541186667978764,
- -0.00730888033285737,
- -0.001783102168701589,
- 0.02339819073677063,
- -0.010825827717781067,
- -0.015888236463069916,
- 0.01876218430697918,
- 0.0076906150206923485,
- 0.0009032754460349679,
- -0.010024012066423893,
- 0.01090280432254076,
- -0.008668390102684498,
- 0.02070549875497818,
- 0.0014594447566196322,
- -0.018775740638375282,
- -0.014814382418990135,
- 0.01796768605709076,
- ],
- "num_tokens": 20,
- }
- ),
- route="/v1/embeddings",
-)
-async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
- responses = await runner.embeddings.async_run(phrases)
- return openllm.EmbeddingsOutput(embeddings=responses["embeddings"].tolist()[0], num_tokens=responses["num_tokens"])
-
-
@svc.api(
input=bentoml.io.Text(),
output=bentoml.io.JSON.from_sample(
@@ -151,42 +109,96 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
model_name=llm_config["model_name"],
framework=llm_config["env"]["framework_value"],
configuration=llm_config.model_dump_json().decode(),
+ supports_embeddings=runner.supports_embeddings,
+ supports_hf_agent=runner.supports_hf_agent,
)
-@svc.api(
- input=bentoml.io.Text.from_sample(sample="default"),
- output=bentoml.io.JSON.from_sample(sample={"success": True, "error_msg": "some error message"}),
- route="/v1/adapters",
-)
-async def adapters_v1(adapter_name: str) -> dict[str, bool | str]:
- return await runner.set_adapter.async_run(adapter_name)
+if runner.supports_embeddings:
+
+ @svc.api(
+ input=bentoml.io.JSON.from_sample(sample=["Hey Jude, welcome to the jumgle!", "What is the meaning of life?"]),
+ output=bentoml.io.JSON.from_sample(
+ sample={
+ "embeddings": [
+ 0.007917795330286026,
+ -0.014421648345887661,
+ 0.00481307040899992,
+ 0.007331526838243008,
+ -0.0066398633643984795,
+ 0.00945580005645752,
+ 0.0087016262114048,
+ -0.010709521360695362,
+ 0.012635177001357079,
+ 0.010541186667978764,
+ -0.00730888033285737,
+ -0.001783102168701589,
+ 0.02339819073677063,
+ -0.010825827717781067,
+ -0.015888236463069916,
+ 0.01876218430697918,
+ 0.0076906150206923485,
+ 0.0009032754460349679,
+ -0.010024012066423893,
+ 0.01090280432254076,
+ -0.008668390102684498,
+ 0.02070549875497818,
+ 0.0014594447566196322,
+ -0.018775740638375282,
+ -0.014814382418990135,
+ 0.01796768605709076,
+ ],
+ "num_tokens": 20,
+ }
+ ),
+ route="/v1/embeddings",
+ )
+ async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
+ responses = await runner.embeddings.async_run(phrases)
+ return openllm.EmbeddingsOutput(
+ embeddings=responses["embeddings"].tolist()[0], num_tokens=responses["num_tokens"]
+ )
-@attr.define
-class HfAgentInput:
- inputs: str
- parameters: t.Dict[str, t.Any]
+if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
+ @attr.define
+ class HfAgentInput:
+ inputs: str
+ parameters: t.Dict[str, t.Any]
-async def hf_agent(request: Request) -> Response:
- json_str = await request.body()
- try:
- input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), HfAgentInput)
- except orjson.JSONDecodeError as err:
- raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None
+ async def hf_agent(request: Request) -> Response:
+ json_str = await request.body()
+ try:
+ input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), HfAgentInput)
+ except orjson.JSONDecodeError as err:
+ raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None
- stop = input_data.parameters.pop("stop", ["\n"])
- try:
- resp = await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters)
- return JSONResponse(resp, status_code=200)
- except NotImplementedError:
- return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)
+ stop = input_data.parameters.pop("stop", ["\n"])
+ try:
+ resp = await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters)
+ return JSONResponse(resp, status_code=200)
+ except NotImplementedError:
+ return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)
+ hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])])
-hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])])
+ svc.mount_asgi_app(hf_app, path="/hf")
-svc.mount_asgi_app(hf_app, path="/hf")
+if runner.has_adapters:
+
+ @svc.api(
+ input=bentoml.io.Text.from_sample(sample="default"),
+ output=bentoml.io.JSON.from_sample(sample={"success": True, "error_msg": "some error message"}),
+ route="/v1/adapters",
+ )
+ async def adapters_v1(adapter_name: str) -> dict[str, bool | str]:
+ return await runner.set_adapter.async_run(adapter_name)
+
+else:
+
+ async def adapters_v1(_: Request) -> Response:
+ return JSONResponse({"success": False, "message": "No available adapters for current running server"})
async def list_adapter_v1(_: Request) -> Response:
@@ -198,5 +210,8 @@ async def list_adapter_v1(_: Request) -> Response:
return JSONResponse(res, status_code=200)
-metadata_app = Starlette(debug=True, routes=[Route("/adapters", list_adapter_v1, methods=["GET"])])
-svc.mount_asgi_app(metadata_app, path="/v1")
+adapters_routes_v1 = [Route("/adapters", list_adapter_v1, methods=["GET"])]
+if not runner.has_adapters:
+ adapters_routes_v1.append(Route("/adapters", adapters_v1, methods=["POST"]))
+adapters_app_v1 = Starlette(debug=True, routes=adapters_routes_v1)
+svc.mount_asgi_app(adapters_app_v1, path="/v1")
diff --git a/src/openllm/_types.py b/src/openllm/_types.py
index 769483bb..d5bfefd7 100644
--- a/src/openllm/_types.py
+++ b/src/openllm/_types.py
@@ -145,6 +145,10 @@ class LLMRunner(bentoml.Runner):
generate_one: RunnerMethod[LLMRunnable, [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
generate_iterator: RunnerMethod[LLMRunnable, [str], t.Generator[t.Any, None, None]]
+ supports_embeddings: bool
+ supports_hf_agent: bool
+ has_adapters: bool
+
def __init__(
self,
runnable_class: type[LLMRunnable],
diff --git a/src/openllm/cli.py b/src/openllm/cli.py
index ea9b5eb2..31f5d1aa 100644
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -34,6 +34,7 @@ bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct'
"""
from __future__ import annotations
import functools
+import http.client
import importlib.machinery
import importlib.util
import inspect
@@ -470,9 +471,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
return super().get_command(ctx, cmd_name)
def list_commands(self, ctx: click.Context) -> list[str]:
- if ctx.command.name == "start" or ctx.command.name == "start-grpc":
+ if ctx.command.name in {"start", "start-grpc"}:
return list(openllm.CONFIG_MAPPING.keys())
-
return super().list_commands(ctx)
@override
@@ -883,7 +883,7 @@ def prerequisite_check(
requirements = llm_config["requirements"]
if requirements is not None and len(requirements) > 0:
- missing_requirements = [i for i in requirements if importlib.util.find_spec(i) is None]
+ missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
if len(missing_requirements) > 0:
_echo(
f"Make sure to have the following dependencies available: {missing_requirements}",
@@ -2339,6 +2339,11 @@ def instruct(
"""
client = openllm.client.HTTPClient(endpoint, timeout=timeout)
+ try:
+ client.call("metadata")
+ except http.client.BadStatusLine:
+ raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
+
if agent == "hf":
if not is_transformers_supports_agent():
raise click.UsageError(
diff --git a/src/openllm/models/baichuan/__init__.py b/src/openllm/models/baichuan/__init__.py
index 02a75b22..3a210e92 100644
--- a/src/openllm/models/baichuan/__init__.py
+++ b/src/openllm/models/baichuan/__init__.py
@@ -11,41 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_cpm_kernels_available
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
- if not is_torch_available() or not is_cpm_kernels_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_baichuan"] = ["Baichuan"]
-
+ if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_baichuan"] = ["Baichuan"]
if t.TYPE_CHECKING:
from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_baichuan import START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING
from .configuration_baichuan import BaichuanConfig as BaichuanConfig
try:
- if not is_torch_available() or not is_cpm_kernels_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_baichuan import Baichuan as Baichuan
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_baichuan import Baichuan as Baichuan
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/baichuan/configuration_baichuan.py b/src/openllm/models/baichuan/configuration_baichuan.py
index 6042c429..a1fe5c95 100644
--- a/src/openllm/models/baichuan/configuration_baichuan.py
+++ b/src/openllm/models/baichuan/configuration_baichuan.py
@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
-
import openllm
-
-
class BaichuanConfig(openllm.LLMConfig):
"""Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.
@@ -26,7 +23,6 @@ class BaichuanConfig(openllm.LLMConfig):
and English benchmarks (C-Eval, MMLU, etc).
Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
"""
-
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
@@ -45,13 +41,10 @@ class BaichuanConfig(openllm.LLMConfig):
"hiyouga/baichuan-7b-sft",
],
}
-
class GenerationConfig:
max_new_tokens: int = 2048
top_p: float = 0.7
temperature: float = 0.95
-
-
START_BAICHUAN_COMMAND_DOCSTRING = """\
Run a LLMServer for Baichuan model.
@@ -71,5 +64,4 @@ or provide `--model-id` flag when running ``openllm start baichuan``:
\b
$ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b'
"""
-
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
diff --git a/src/openllm/models/baichuan/modeling_baichuan.py b/src/openllm/models/baichuan/modeling_baichuan.py
index 058bd2f3..b7cb1295 100644
--- a/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/src/openllm/models/baichuan/modeling_baichuan.py
@@ -13,69 +13,31 @@
# limitations under the License.
from __future__ import annotations
import typing as t
-
import openllm
-
from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
-
-
if t.TYPE_CHECKING:
import torch
-
import transformers
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
-
class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
__openllm_internal__ = True
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- top_p: float | None = None,
- temperature: float | None = None,
- use_default_prompt_template: bool = False,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
# NOTE: The rest of attrs should be kwargs for GenerationConfig
- generate_kwargs = {
- "max_new_tokens": max_new_tokens,
- "top_p": top_p,
- "temperature": temperature,
- **attrs,
- }
-
+ generate_kwargs = {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}
return prompt_text, generate_kwargs, {}
-
- def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
- return generation_result[0]
-
+ def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
- outputs = self.model.generate(
- **inputs,
- generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
- )
+ outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
diff --git a/src/openllm/models/chatglm/__init__.py b/src/openllm/models/chatglm/__init__.py
index 338da543..00457a4c 100644
--- a/src/openllm/models/chatglm/__init__.py
+++ b/src/openllm/models/chatglm/__init__.py
@@ -11,41 +11,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_cpm_kernels_available
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
- if not is_torch_available() or not is_cpm_kernels_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_chatglm"] = ["ChatGLM"]
-
+ if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_chatglm"] = ["ChatGLM"]
if t.TYPE_CHECKING:
from .configuration_chatglm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_chatglm import START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING
from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig
-
try:
- if not is_torch_available() or not is_cpm_kernels_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_chatglm import ChatGLM as ChatGLM
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_chatglm import ChatGLM as ChatGLM
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/chatglm/configuration_chatglm.py b/src/openllm/models/chatglm/configuration_chatglm.py
index b3beae85..711e7d1b 100644
--- a/src/openllm/models/chatglm/configuration_chatglm.py
+++ b/src/openllm/models/chatglm/configuration_chatglm.py
@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
-
import openllm
-
-
class ChatGLMConfig(openllm.LLMConfig):
"""ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.
@@ -30,7 +27,6 @@ class ChatGLMConfig(openllm.LLMConfig):
Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
"""
-
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
@@ -48,22 +44,17 @@ class ChatGLMConfig(openllm.LLMConfig):
"thudm/chatglm2-6b-int4",
],
}
-
retain_history: bool = openllm.LLMConfig.Field(
False,
description="""Whether to retain history given to the model.
If set to True, then the model will retain given history.""",
)
-
use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
-
class GenerationConfig:
max_new_tokens: int = 2048
num_beams: int = 1
top_p: float = 0.7
temperature: float = 0.95
-
-
START_CHATGLM_COMMAND_DOCSTRING = """\
Run a LLMServer for ChatGLM model.
@@ -83,5 +74,4 @@ or provide `--model-id` flag when running ``openllm start chatglm``:
\b
$ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
"""
-
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
diff --git a/src/openllm/models/chatglm/modeling_chatglm.py b/src/openllm/models/chatglm/modeling_chatglm.py
index b79f1744..ab01e0b5 100644
--- a/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/src/openllm/models/chatglm/modeling_chatglm.py
@@ -13,94 +13,34 @@
# limitations under the License.
from __future__ import annotations
import typing as t
-
-import bentoml
import openllm
-
-from ...utils import generate_labels
-
-
if t.TYPE_CHECKING:
import torch
-
import transformers
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
-
class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
__openllm_internal__ = True
-
- def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
- _, tokenizer_attrs = self.llm_parameters
-
- return bentoml.transformers.save_model(
- self.tag,
- transformers.AutoModel.from_pretrained(self.model_id, trust_remote_code=trust_remote_code),
- labels=generate_labels(self),
- custom_objects={
- "tokenizer": transformers.AutoTokenizer.from_pretrained(
- self.model_id, trust_remote_code=trust_remote_code, **tokenizer_attrs
- )
- },
- )
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- num_beams: int | None = None,
- top_p: float | None = None,
- temperature: float | None = None,
- chat_history: list[str] | None = None,
- use_default_prompt_template: bool = False,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[str] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
prompt_text = ""
-
if use_default_prompt_template and chat_history is not None:
- for i, (old_query, response) in enumerate(chat_history):
- prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" # noqa: RUF001
+ for i, (old_query, response) in enumerate(chat_history): prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" # noqa: RUF001
prompt_text += f"[Round {len(chat_history)}]\n问:{prompt}\n答:" # noqa: RUF001
- else:
- prompt_text = prompt
-
+ else: prompt_text = prompt
postprocess_generate_kwargs = {"chat_history": chat_history if chat_history is not None else None}
-
# NOTE: The rest of attrs should be kwargs for GenerationConfig
- generate_kwargs = {
- "max_new_tokens": max_new_tokens,
- "num_beams": num_beams,
- "top_p": top_p,
- "temperature": temperature,
- **attrs,
- }
-
+ generate_kwargs = {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}
return prompt_text, generate_kwargs, postprocess_generate_kwargs
-
- def postprocess_generate(
- self,
- prompt: str,
- generation_result: tuple[str, list[tuple[str, str]]],
- *,
- chat_history: list[tuple[str, str]] | None = None,
- **attrs: t.Any,
- ):
+ def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any):
generated, history = generation_result
if self.config.retain_history:
assert chat_history is not None, "'retain_history' is True while there is no history provided."
chat_history.extend(history)
return generated
-
def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]:
with torch.inference_mode():
self.model.eval()
# Only use half precision if the model is not yet quantized
- if self.config.use_half_precision:
- self.model.half()
- return self.model.chat(
- self.tokenizer,
- prompt,
- generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
- )
+ if self.config.use_half_precision: self.model.half()
+ return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
diff --git a/src/openllm/models/dolly_v2/__init__.py b/src/openllm/models/dolly_v2/__init__.py
index fa80ecbb..7f5a851c 100644
--- a/src/openllm/models/dolly_v2/__init__.py
+++ b/src/openllm/models/dolly_v2/__init__.py
@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
- if not is_torch_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_dolly_v2"] = ["DollyV2"]
-
+ if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_dolly_v2"] = ["DollyV2"]
if t.TYPE_CHECKING:
from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_dolly_v2 import START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING
from .configuration_dolly_v2 import DollyV2Config as DollyV2Config
-
try:
- if not is_torch_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_dolly_v2 import DollyV2 as DollyV2
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_torch_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_dolly_v2 import DollyV2 as DollyV2
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/dolly_v2/configuration_dolly_v2.py b/src/openllm/models/dolly_v2/configuration_dolly_v2.py
index 369f74d9..e0eba513 100644
--- a/src/openllm/models/dolly_v2/configuration_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/configuration_dolly_v2.py
@@ -13,14 +13,8 @@
# limitations under the License.
from __future__ import annotations
import typing as t
-
import openllm
-
-
-if t.TYPE_CHECKING:
- from transformers import PreTrainedTokenizer
-
-
+if t.TYPE_CHECKING: import transformers
class DollyV2Config(openllm.LLMConfig):
"""Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.
@@ -33,7 +27,6 @@ class DollyV2Config(openllm.LLMConfig):
Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
"""
-
__config__ = {
"timeout": 3600000,
"url": "https://github.com/databrickslabs/dolly",
@@ -41,19 +34,15 @@ class DollyV2Config(openllm.LLMConfig):
"default_id": "databricks/dolly-v2-3b",
"model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"],
}
-
return_full_text: bool = openllm.LLMConfig.Field(
False, description="Whether to return the full prompt to the users."
)
-
class GenerationConfig:
temperature: float = 0.9
top_p: float = 0.92
top_k: int = 5
max_new_tokens: int = 256
eos_token_id: int = 50277 # NOTE: from get_special_token_id(self.tokenizer, END_KEY)
-
-
START_DOLLY_V2_COMMAND_DOCSTRING = """\
Run a LLMServer for dolly-v2 model.
@@ -73,14 +62,10 @@ or provide `--model-id` flag when running ``openllm start dolly-v2``:
\b
$ openllm start dolly-v2 --model-id databricks/dolly-v2-7b
"""
-
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
-INTRO_BLURB = (
- "Below is an instruction that describes a task. Write a response that appropriately completes the request."
-)
-
+INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
# NOTE: This is the prompt that is used for generating responses using an already
# trained model. It ends with the response key, where the job of the model is to provide
# the completion that follows it (i.e. the response itself).
@@ -88,15 +73,8 @@ DEFAULT_PROMPT_TEMPLATE = """{intro}
{instruction_key}
{instruction}
{response_key}
-""".format(
- intro=INTRO_BLURB,
- instruction_key=INSTRUCTION_KEY,
- instruction="{instruction}",
- response_key=RESPONSE_KEY,
-)
-
-
-def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
+""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
+def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) -> int:
"""Gets the token ID for a given string that has been added to the tokenizer as a special token.
When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
@@ -113,6 +91,5 @@ def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
int: the token ID for the given key.
"""
token_ids = tokenizer.encode(key)
- if len(token_ids) > 1:
- raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
+ if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
return token_ids[0]
diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
index 0f69aaf5..d08a966c 100644
--- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -15,288 +15,118 @@ from __future__ import annotations
import logging
import re
import typing as t
-
import openllm
-
from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
from .configuration_dolly_v2 import END_KEY
from .configuration_dolly_v2 import RESPONSE_KEY
from .configuration_dolly_v2 import get_special_token_id
-
-
if t.TYPE_CHECKING:
- import tensorflow as tf
import torch
-
import transformers
+ import tensorflow as tf
else:
tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
logger = logging.getLogger(__name__)
-
-
@t.overload
-def get_pipeline(
- model: transformers.PreTrainedModel,
- tokenizer: transformers.PreTrainedTokenizer,
- _init: t.Literal[True] = True,
- **attrs: t.Any,
-) -> transformers.Pipeline:
- ...
-
-
+def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline: ...
@t.overload
-def get_pipeline(
- model: transformers.PreTrainedModel,
- tokenizer: transformers.PreTrainedTokenizer,
- _init: t.Literal[False] = ...,
- **attrs: t.Any,
-) -> type[transformers.Pipeline]:
- ...
-
-
-def get_pipeline(
- model: transformers.PreTrainedModel,
- tokenizer: transformers.PreTrainedTokenizer,
- _init: bool = False,
- **attrs: t.Any,
-) -> type[transformers.Pipeline] | transformers.Pipeline:
+def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]: ...
+def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
class InstructionTextGenerationPipeline(transformers.Pipeline):
- def __init__(
- self,
- *args: t.Any,
- do_sample: bool = True,
- max_new_tokens: int = 256,
- top_p: float = 0.92,
- top_k: int = 0,
- **kwargs: t.Any,
- ):
- """Initialize the pipeline.
-
- Args:
- do_sample: Whether or not to use sampling. Defaults to True.
- max_new_tokens: Max new tokens after the prompt to generate. Defaults to 128.
- top_p: If set to float < 1, only the smallest set of most probable tokens with
- probabilities that add up to top_p or higher are kept for generation. Defaults to 0.92.
- top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to 0.
- *args: Additional positional arguments to be passed to ``transformers.Pipeline``.
- **kwargs: Additional keyword arguments to be passed to ``transformers.Pipeline``.
- """
- super().__init__(
- *args,
- model=model,
- tokenizer=tokenizer,
- do_sample=do_sample,
- max_new_tokens=max_new_tokens,
- top_p=top_p,
- top_k=top_k,
- **kwargs,
- )
-
+ def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any): super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)
def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any):
- if t.TYPE_CHECKING:
- assert self.tokenizer is not None
+ if t.TYPE_CHECKING: assert self.tokenizer is not None
preprocess_params: dict[str, t.Any] = {}
-
# newer versions of the tokenizer configure the response key as a special token. newer versions still may
# append a newline to yield a single token. find whatever token is configured for the response key.
- tokenizer_response_key = next(
- (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
- )
-
+ tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
response_key_token_id = None
end_key_token_id = None
if tokenizer_response_key:
try:
response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)
-
# Ensure generation stops once it generates "### End"
generate_kwargs["eos_token_id"] = end_key_token_id
- except ValueError:
- pass
-
+ except ValueError: pass
forward_params = generate_kwargs
postprocess_params = {"response_key_token_id": response_key_token_id, "end_key_token_id": end_key_token_id}
-
- if return_full_text is not None:
- postprocess_params["return_full_text"] = return_full_text
-
+ if return_full_text is not None: postprocess_params["return_full_text"] = return_full_text
return preprocess_params, forward_params, postprocess_params
-
def preprocess(self, input_: str, **generate_kwargs: t.Any):
- if t.TYPE_CHECKING:
- assert self.tokenizer is not None
+ if t.TYPE_CHECKING: assert self.tokenizer is not None
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=input_)
inputs = self.tokenizer(prompt_text, return_tensors="pt")
inputs["prompt_text"] = prompt_text
inputs["instruction_text"] = input_
return inputs
-
def _forward(self, model_inputs: dict[str, t.Any], **generate_kwargs: t.Any):
- if t.TYPE_CHECKING:
- assert self.tokenizer is not None
- input_ids = model_inputs["input_ids"]
- attention_mask = model_inputs.get("attention_mask", None)
-
- if input_ids.shape[1] == 0:
- input_ids = None
- attention_mask = None
- in_b = 1
- else:
- in_b = input_ids.shape[0]
-
- generated_sequence = self.model.generate(
- input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
- attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
- pad_token_id=self.tokenizer.pad_token_id,
- **generate_kwargs,
- )
-
+ if t.TYPE_CHECKING: assert self.tokenizer is not None
+ input_ids, attention_mask = model_inputs["input_ids"], model_inputs.get("attention_mask", None)
+ if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
+ else: in_b = input_ids.shape[0]
+ generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None, attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None, pad_token_id=self.tokenizer.pad_token_id, **generate_kwargs)
out_b = generated_sequence.shape[0]
- if self.framework == "pt":
- generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
- elif self.framework == "tf":
- generated_sequence = tf.reshape(
- generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])
- )
-
+ if self.framework == "pt": generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
+ elif self.framework == "tf": generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
instruction_text = model_inputs.pop("instruction_text")
- return {
- "generated_sequence": generated_sequence,
- "input_ids": input_ids,
- "instruction_text": instruction_text,
- }
-
- def postprocess(
- self,
- model_outputs: dict[str, t.Any],
- response_key_token_id: int,
- end_key_token_id: int,
- return_full_text: bool = False,
- ):
- if t.TYPE_CHECKING:
- assert self.tokenizer is not None
- generated_sequence = model_outputs["generated_sequence"][0]
- instruction_text = model_outputs["instruction_text"]
+ return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}
+ def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False):
+ if t.TYPE_CHECKING: assert self.tokenizer is not None
+ generated_sequence, instruction_text = model_outputs["generated_sequence"][0], model_outputs["instruction_text"]
generated_sequence: list[list[int]] = generated_sequence.numpy().tolist()
records: list[dict[t.Literal["generated_text"], str]] = []
for sequence in generated_sequence:
# The response will be set to this variable if we can identify it.
decoded = None
-
# If we have token IDs for the response and end, then we can find the tokens and only decode between them.
if response_key_token_id and end_key_token_id:
# Find where "### Response:" is first found in the generated tokens. Considering this is part of the
# prompt, we should definitely find it. We will return the tokens found after this token.
- try:
- response_pos = sequence.index(response_key_token_id)
- except ValueError:
- logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence)
- response_pos = None
-
+ try: response_pos = sequence.index(response_key_token_id)
+ except ValueError: response_pos = None
+ if response_pos is None: logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence)
if response_pos:
# Next find where "### End" is located. The model has been trained to end its responses with this
# sequence (or actually, the token ID it maps to, since it is a special token). We may not find
# this token, as the response could be truncated. If we don't find it then just return everything
# to the end. Note that even though we set eos_token_id, we still see the this token at the end.
- try:
- end_pos = sequence.index(end_key_token_id)
- except ValueError:
- end_pos = None
-
+ try: end_pos = sequence.index(end_key_token_id)
+ except ValueError: end_pos = None
decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()
-
if not decoded:
# Otherwise we'll decode everything and use a regex to find the response and end.
-
fully_decoded = self.tokenizer.decode(sequence)
-
# The response appears after "### Response:". The model has been trained to append "### End" at the
# end.
m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)
-
- if m:
- decoded = m.group(1).strip()
+ if m: decoded = m.group(1).strip()
else:
# The model might not generate the "### End" sequence before reaching the max tokens. In this case,
# return everything after "### Response:".
m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
- if m:
- decoded = m.group(1).strip()
- else:
- logger.warning("Failed to find response in:\n%s", fully_decoded)
-
+ if m: decoded = m.group(1).strip()
+ else: logger.warning("Failed to find response in:\n%s", fully_decoded)
# If the full text is requested, then append the decoded text to the original instruction.
# This technically isn't the full text, as we format the instruction in the prompt the model has been
# trained on, but to the client it will appear to be the full text.
- if return_full_text:
- decoded = f"{instruction_text}\n{decoded}"
-
+ if return_full_text: decoded = f"{instruction_text}\n{decoded}"
rec = {"generated_text": decoded}
-
records.append(rec)
-
return records
- if _init:
- return InstructionTextGenerationPipeline()
+ if _init: return InstructionTextGenerationPipeline()
return InstructionTextGenerationPipeline
-
-
class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedTokenizer"]):
__openllm_internal__ = True
-
@property
- def import_kwargs(self):
- model_kwds = {
- "device_map": "auto" if torch.cuda.is_available() else None,
- "torch_dtype": torch.bfloat16,
- }
- tokenizer_kwds = {"padding_side": "left"}
- return model_kwds, tokenizer_kwds
-
- def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
- return get_pipeline(
- model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
- tokenizer=self.tokenizer,
- _init=True,
- return_full_text=self.config.return_full_text,
- )
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- temperature: float | None = None,
- top_k: int | None = None,
- top_p: float | None = None,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
- # NOTE: The rest of attrs should be kwargs for GenerationConfig
- generate_kwargs = {
- "max_new_tokens": max_new_tokens,
- "top_k": top_k,
- "top_p": top_p,
- "temperature": temperature,
- **attrs,
- }
-
- return prompt, generate_kwargs, {}
-
- def postprocess_generate(
- self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any
- ) -> str:
- return generation_result[0]["generated_text"]
-
+ def import_kwargs(self): return {"device_map": "auto" if torch.cuda.is_available() else None, "torch_dtype": torch.bfloat16}, {"padding_side": "left"}
+ def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: return get_pipeline(model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), tokenizer=self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return prompt, {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {}
+ def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: return generation_result[0]["generated_text"]
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
- with torch.inference_mode():
- llm_config = self.config.model_construct_env(**attrs)
- return self.model(
- prompt,
- return_full_text=llm_config.return_full_text,
- generation_config=llm_config.to_generation_config(),
- )
+ llm_config = self.config.model_construct_env(**attrs)
+ with torch.inference_mode(): return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
diff --git a/src/openllm/models/falcon/__init__.py b/src/openllm/models/falcon/__init__.py
index 23305ba7..462f518a 100644
--- a/src/openllm/models/falcon/__init__.py
+++ b/src/openllm/models/falcon/__init__.py
@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
- if not is_torch_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_falcon"] = ["Falcon"]
-
+ if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_falcon"] = ["Falcon"]
if t.TYPE_CHECKING:
from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_falcon import START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING
from .configuration_falcon import FalconConfig as FalconConfig
-
try:
- if not is_torch_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_falcon import Falcon as Falcon
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_torch_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_falcon import Falcon as Falcon
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/falcon/configuration_falcon.py b/src/openllm/models/falcon/configuration_falcon.py
index c5e7e7bb..176cc95e 100644
--- a/src/openllm/models/falcon/configuration_falcon.py
+++ b/src/openllm/models/falcon/configuration_falcon.py
@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
-
import openllm
-
-
class FalconConfig(openllm.LLMConfig):
"""Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.
@@ -23,7 +20,6 @@ class FalconConfig(openllm.LLMConfig):
Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
"""
-
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
@@ -50,15 +46,12 @@ class FalconConfig(openllm.LLMConfig):
},
),
}
-
class GenerationConfig:
max_new_tokens: int = 200
top_k: int = 10
num_return_sequences: int = 1
num_beams: int = 4
early_stopping: bool = True
-
-
START_FALCON_COMMAND_DOCSTRING = """\
Run a LLMServer for FalconLM model.
@@ -78,7 +71,6 @@ or provide `--model-id` flag when running ``openllm start falcon``:
\b
$ openllm start falcon --model-id tiiuae/falcon-7b-instruct
"""
-
DEFAULT_PROMPT_TEMPLATE = """{context}
{user_name}: {instruction}
{agent}:
diff --git a/src/openllm/models/falcon/modeling_falcon.py b/src/openllm/models/falcon/modeling_falcon.py
index 8ee85779..342d99ae 100644
--- a/src/openllm/models/falcon/modeling_falcon.py
+++ b/src/openllm/models/falcon/modeling_falcon.py
@@ -11,105 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
import typing as t
-
import openllm
-
from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
-
-
if t.TYPE_CHECKING:
import torch
-
import transformers
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
-
class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
__openllm_internal__ = True
-
@property
- def import_kwargs(self):
- model_kwds = {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() else None}
- tokenizer_kwds: dict[str, t.Any] = {}
- return model_kwds, tokenizer_kwds
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- top_k: int | None = None,
- num_return_sequences: int | None = None,
- eos_token_id: int | None = None,
- use_default_prompt_template: bool = False,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() else None}, {}
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument instead of "
- "kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "top_k": top_k,
- "num_return_sequences": num_return_sequences,
- "eos_token_id": eos_token_id,
- **attrs,
- }
-
- return prompt_text, generation_config, {}
-
- def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
- return generation_result[0]
-
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ return prompt_text, {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}
+ def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
- eos_token_id = attrs.pop("eos_token_id", self.tokenizer.eos_token_id)
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
- with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
- outputs = self.model.generate(
- input_ids=inputs["input_ids"],
- attention_mask=inputs["attention_mask"],
- generation_config=self.config.model_construct_env(
- eos_token_id=eos_token_id, **attrs
- ).to_generation_config(),
- )
- return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
- def generate_one(
- self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any
- ) -> list[dict[t.Literal["generated_text"], str]]:
+ eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device)
+ with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()), skip_special_tokens=True)
+ def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
from ..._generation import StopSequenceCriteria
-
- max_new_tokens = preprocess_generate_kwds.pop("max_new_tokens", 200)
- encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
- src_len = encoded_inputs["input_ids"].shape[1]
- stopping_criteria = preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
+ max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
+ src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
stopping_criteria.append(StopSequenceCriteria(stop, self.tokenizer))
- outputs = self.model.generate(
- encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria
- )
-
- result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
+ result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
- if result.endswith(stop_seq):
- result = result[: -len(stop_seq)]
+ if result.endswith(stop_seq): result = result[: -len(stop_seq)]
return [{"generated_text": result}]
diff --git a/src/openllm/models/flan_t5/__init__.py b/src/openllm/models/flan_t5/__init__.py
index 00b23d10..2d5c97d1 100644
--- a/src/openllm/models/flan_t5/__init__.py
+++ b/src/openllm/models/flan_t5/__init__.py
@@ -13,73 +13,40 @@
# limitations under the License.
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_flax_available
from ...utils import is_tf_available
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
- if not is_torch_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_flan_t5"] = ["FlanT5"]
-
+ if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_flan_t5"] = ["FlanT5"]
try:
- if not is_flax_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
-
+ if not is_flax_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
try:
- if not is_tf_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
-
-
+ if not is_tf_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
if t.TYPE_CHECKING:
from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_flan_t5 import START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
from .configuration_flan_t5 import FlanT5Config as FlanT5Config
-
try:
- if not is_torch_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_flan_t5 import FlanT5 as FlanT5
-
+ if not is_torch_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_flan_t5 import FlanT5 as FlanT5
try:
- if not is_flax_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
-
+ if not is_flax_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
try:
- if not is_tf_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_tf_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/flan_t5/configuration_flan_t5.py b/src/openllm/models/flan_t5/configuration_flan_t5.py
index c718b16c..8129273b 100644
--- a/src/openllm/models/flan_t5/configuration_flan_t5.py
+++ b/src/openllm/models/flan_t5/configuration_flan_t5.py
@@ -12,10 +12,33 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
-
import openllm
+class FlanT5Config(openllm.LLMConfig):
+ """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
+ It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
+ Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
+ """
+ __config__ = {
+ "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
+ "default_id": "google/flan-t5-large",
+ "architecture": "T5ForConditionalGeneration",
+ "model_ids": [
+ "google/flan-t5-small",
+ "google/flan-t5-base",
+ "google/flan-t5-large",
+ "google/flan-t5-xl",
+ "google/flan-t5-xxl",
+ ],
+ "model_type": "seq2seq_lm",
+ }
+ class GenerationConfig:
+ temperature: float = 0.9
+ max_new_tokens: int = 2048
+ top_k: int = 50
+ top_p: float = 0.4
+ repetition_penalty = 1.0
START_FLAN_T5_COMMAND_DOCSTRING = """\
Run a LLMServer for FLAN-T5 model.
@@ -41,35 +64,4 @@ or provide `--model-id` flag when running ``openllm start flan-t5``:
\b
$ openllm start flan-t5 --model-id google/flan-t5-xxl
"""
-
DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:"""
-
-
-class FlanT5Config(openllm.LLMConfig):
- """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
-
- It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
-
- Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
- """
-
- __config__ = {
- "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
- "default_id": "google/flan-t5-large",
- "architecture": "T5ForConditionalGeneration",
- "model_ids": [
- "google/flan-t5-small",
- "google/flan-t5-base",
- "google/flan-t5-large",
- "google/flan-t5-xl",
- "google/flan-t5-xxl",
- ],
- "model_type": "seq2seq_lm",
- }
-
- class GenerationConfig:
- temperature: float = 0.9
- max_new_tokens: int = 2048
- top_k: int = 50
- top_p: float = 0.4
- repetition_penalty = 1.0
diff --git a/src/openllm/models/flan_t5/modeling_flan_t5.py b/src/openllm/models/flan_t5/modeling_flan_t5.py
index 11ef2adc..a8630259 100644
--- a/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -13,71 +13,25 @@
# limitations under the License.
from __future__ import annotations
import typing as t
-
import openllm
-
from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
-
-
if t.TYPE_CHECKING:
import torch
-
import transformers # noqa: F401
else:
torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-
-
class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
__openllm_internal__ = True
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- temperature: float | None = None,
- top_k: int | None = None,
- top_p: float | None = None,
- repetition_penalty: float | None = None,
- use_default_prompt_template: bool = True,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "temperature": temperature,
- "top_k": top_k,
- "top_p": top_p,
- "repetition_penalty": repetition_penalty,
- }
- return prompt_text, generation_config, {}
-
- def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
- return generation_result[0]
-
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
+ def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
- with torch.inference_mode():
- input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
- result_tensor = self.model.generate(
- input_ids,
- do_sample=True,
- generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
- )
- return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)
+ with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
diff --git a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
index 7e3eafd6..68056019 100644
--- a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -26,64 +26,18 @@ if t.TYPE_CHECKING:
class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
__openllm_internal__ = True
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- temperature: float | None = None,
- top_k: int | None = None,
- top_p: float | None = None,
- repetition_penalty: float | None = None,
- decoder_start_token_id: int | None = None,
- use_default_prompt_template: bool = True,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, decoder_start_token_id: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- if decoder_start_token_id is None:
- decoder_start_token_id = 0
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "temperature": temperature,
- "top_k": top_k,
- "top_p": top_p,
- "repetition_penalty": repetition_penalty,
- "decoder_start_token_id": decoder_start_token_id,
- }
- return prompt_text, generation_config, {}
-
- def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
- return generation_result[0]
-
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ if decoder_start_token_id is None: decoder_start_token_id = 0
+ return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty, "decoder_start_token_id": decoder_start_token_id}, {}
+ def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
- # XXX: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main
- # as it is required for encoder-decoder generation.
+ # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
decoder_start_token_id = attrs.pop("decoder_start_token_id", 0)
- input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
- result_tensor = self.model.generate(
- input_ids,
- do_sample=True,
- generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
- decoder_start_token_id=decoder_start_token_id,
- )
- return self.tokenizer.batch_decode(
- result_tensor.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
- )
+ return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="np")["input_ids"], do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), decoder_start_token_id=decoder_start_token_id).sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True)
diff --git a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
index 34f63082..bd892ed5 100644
--- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
@@ -13,66 +13,20 @@
# limitations under the License.
from __future__ import annotations
import typing as t
-
import openllm
-
from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
-
-
-if t.TYPE_CHECKING:
- import transformers # noqa: F401
-
-
+if t.TYPE_CHECKING: import transformers # noqa: F401
class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
__openllm_internal__ = True
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- temperature: float | None = None,
- top_k: int | None = None,
- top_p: float | None = None,
- repetition_penalty: float | None = None,
- use_default_prompt_template: bool = True,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "temperature": temperature,
- "top_k": top_k,
- "top_p": top_p,
- "repetition_penalty": repetition_penalty,
- }
- return prompt_text, generation_config, {}
-
- def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
- return generation_result[0]
-
- def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
- input_ids = self.tokenizer(prompt, return_tensors="tf").input_ids
- outputs = self.model.generate(
- input_ids,
- do_sample=True,
- generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
- )
- return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
+ def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
+ def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
diff --git a/src/openllm/models/gpt_neox/__init__.py b/src/openllm/models/gpt_neox/__init__.py
index e531ddab..96aebfaa 100644
--- a/src/openllm/models/gpt_neox/__init__.py
+++ b/src/openllm/models/gpt_neox/__init__.py
@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
- if not is_torch_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
-
+ if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
if t.TYPE_CHECKING:
from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_gpt_neox import START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING
from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig
-
try:
- if not is_torch_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_gpt_neox import GPTNeoX as GPTNeoX
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_torch_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_gpt_neox import GPTNeoX as GPTNeoX
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/gpt_neox/configuration_gpt_neox.py b/src/openllm/models/gpt_neox/configuration_gpt_neox.py
index be88eeea..302a912a 100644
--- a/src/openllm/models/gpt_neox/configuration_gpt_neox.py
+++ b/src/openllm/models/gpt_neox/configuration_gpt_neox.py
@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
-
import openllm
-
-
class GPTNeoXConfig(openllm.LLMConfig):
"""GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license.
@@ -32,7 +28,6 @@ class GPTNeoXConfig(openllm.LLMConfig):
Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
for more information.
"""
-
__config__ = {
"model_name": "gpt_neox",
"start_name": "gpt-neox",
@@ -42,14 +37,10 @@ class GPTNeoXConfig(openllm.LLMConfig):
"default_id": "eleutherai/gpt-neox-20b",
"model_ids": ["eleutherai/gpt-neox-20b"],
}
-
use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
-
class GenerationConfig:
temperature: float = 0.9
max_new_tokens: int = 100
-
-
START_GPT_NEOX_COMMAND_DOCSTRING = """\
Run a LLMServer for GPTNeoX model.
@@ -69,6 +60,4 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``:
\b
$ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b'
"""
-
-
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
diff --git a/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/src/openllm/models/gpt_neox/modeling_gpt_neox.py
index 8f573e83..bd38b0a5 100644
--- a/src/openllm/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/openllm/models/gpt_neox/modeling_gpt_neox.py
@@ -11,88 +11,34 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
import logging
import typing as t
-
import openllm
-
from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
-
-
-if t.TYPE_CHECKING:
- import torch
-
- import transformers
-else:
- transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
- torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-
-
+if t.TYPE_CHECKING: import torch, transformers
+else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
-
-
class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
__openllm_internal__ = True
-
- def sanitize_parameters(
- self,
- prompt: str,
- temperature: float | None = None,
- max_new_tokens: int | None = None,
- use_default_prompt_template: bool = True,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- generation_config = {"max_new_tokens": max_new_tokens, "temperature": temperature}
-
- return prompt_text, generation_config, {}
-
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature}, {}
@property
- def import_kwargs(self):
- model_kwds = {"device_map": "auto" if torch.cuda.device_count() > 1 else None}
- tokenizer_kwds: dict[str, t.Any] = {}
- return model_kwds, tokenizer_kwds
-
- def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
- return generation_result[0]
-
+ def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.device_count() > 1 else None}, {}
+ def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
- if self.config.use_half_precision:
- model.half()
+ if self.config.use_half_precision: model.half()
return model
-
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
from ..._generation import StopOnTokens
-
- generation_kwargs = {
- "do_sample": True,
- "generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
- "pad_token_id": self.tokenizer.eos_token_id,
- "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
- }
-
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
- with torch.inference_mode():
- gen_tokens = self.model.generate(inputs.input_ids, **generation_kwargs)
- return self.tokenizer.batch_decode(gen_tokens)
+ generation_kwargs = {"do_sample": True, "generation_config": self.config.model_construct_env(**attrs).to_generation_config(), "pad_token_id": self.tokenizer.eos_token_id, "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()])}
+ with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, **generation_kwargs))
diff --git a/src/openllm/models/llama/__init__.py b/src/openllm/models/llama/__init__.py
index 687a249e..c8dbfda5 100644
--- a/src/openllm/models/llama/__init__.py
+++ b/src/openllm/models/llama/__init__.py
@@ -11,64 +11,33 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
from ...utils import is_vllm_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_llama": [
- "LlaMAConfig",
- "START_LLAMA_COMMAND_DOCSTRING",
- "DEFAULT_PROMPT_TEMPLATE",
- "PROMPT_MAPPING",
- ],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_llama": ["LlaMAConfig", "START_LLAMA_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
try:
- if not is_vllm_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_vllm_llama"] = ["VLLMLlaMA"]
-
+ if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_vllm_llama"] = ["VLLMLlaMA"]
try:
- if not is_torch_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_llama"] = ["LlaMA"]
-
-
+ if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_llama"] = ["LlaMA"]
if t.TYPE_CHECKING:
from .configuration_llama import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_llama import PROMPT_MAPPING as PROMPT_MAPPING
from .configuration_llama import START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING
from .configuration_llama import LlaMAConfig as LlaMAConfig
-
try:
- if not is_vllm_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_vllm_llama import VLLMLlaMA as VLLMLlaMA
-
+ if not is_vllm_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_vllm_llama import VLLMLlaMA as VLLMLlaMA
try:
- if not is_torch_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_llama import LlaMA as LlaMA
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_torch_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_llama import LlaMA as LlaMA
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/llama/configuration_llama.py b/src/openllm/models/llama/configuration_llama.py
index 698e7de3..426ab887 100644
--- a/src/openllm/models/llama/configuration_llama.py
+++ b/src/openllm/models/llama/configuration_llama.py
@@ -11,13 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
import typing as t
-
import openllm
-
-
class LlaMAConfig(openllm.LLMConfig):
"""LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
@@ -30,11 +26,7 @@ class LlaMAConfig(openllm.LLMConfig):
Refer to [LlaMA's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
for more information.
"""
-
- use_llama2_prompt: bool = openllm.LLMConfig.Field(
- True, description="Whether to use the prompt format for LlaMA 2. Disable this when working with LlaMA 1."
- )
-
+ use_llama2_prompt: bool = openllm.LLMConfig.Field(True, description="Whether to use the prompt format for LlaMA 2. Disable this when working with LlaMA 1.")
__config__ = {
"model_name": "llama",
"start_name": "llama",
@@ -69,18 +61,14 @@ class LlaMAConfig(openllm.LLMConfig):
},
),
}
-
class GenerationConfig:
max_new_tokens: int = 256
temperature: float = 0.45
top_p: float = 0.95
top_k: int = 12
-
class SamplingParams:
best_of: int = 1
presence_penalty: float = 0.5
-
-
START_LLAMA_COMMAND_DOCSTRING = """\
Run a LLMServer for LlaMA model.
@@ -110,39 +98,14 @@ OpenLLM also supports running LlaMA-2 and its fine-tune and variants. To import
\b
$ CONVERTER=hf-llama2 openllm import llama /path/to/llama-2
"""
-
SYSTEM_MESSAGE = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
"""
-
-SINST_KEY = "[INST]"
-EINST_KEY = "[/INST]"
-SYS_KEY = "<>"
-EOS_TOKEN = ""
-BOS_TOKEN = ""
-
-# TODO: support history
-_v2_prompt = """{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} """.format(
- start_key=SINST_KEY,
- sys_key=SYS_KEY,
- system_message=SYSTEM_MESSAGE,
- instruction="{instruction}",
- end_key=EINST_KEY,
-)
-
-# XXX: implement me
-_v1_prompt = """{instruction}"""
-
-PROMPT_MAPPING = {
- "v1": _v1_prompt,
- "v2": _v2_prompt,
-}
-
-
-def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str:
- return PROMPT_MAPPING[model_type]
-
-
+SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = "[INST]", "[/INST]", "<>", "", ""
+# TODO: support history and v1 prompt implementation
+_v1_prompt, _v2_prompt = """{instruction}""", """{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} """.format(start_key=SINST_KEY, sys_key=SYS_KEY, system_message=SYSTEM_MESSAGE, instruction="{instruction}", end_key=EINST_KEY)
+PROMPT_MAPPING = {"v1": _v1_prompt, "v2": _v2_prompt}
+def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str: return PROMPT_MAPPING[model_type]
DEFAULT_PROMPT_TEMPLATE = _get_prompt
diff --git a/src/openllm/models/llama/modeling_llama.py b/src/openllm/models/llama/modeling_llama.py
index 94a4b677..1ed4ec5b 100644
--- a/src/openllm/models/llama/modeling_llama.py
+++ b/src/openllm/models/llama/modeling_llama.py
@@ -11,110 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
import logging
import typing as t
-
import openllm
-
from .configuration_llama import DEFAULT_PROMPT_TEMPLATE
from ..._llm import LLMEmbeddings
from ..._prompt import default_formatter
-
-
-if t.TYPE_CHECKING:
- import torch
- import torch.nn.functional as F
-
- import transformers
-else:
- transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
- torch = openllm.utils.LazyLoader("torch", globals(), "torch")
- F = openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
-
-
+if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F
+else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
logger = logging.getLogger(__name__)
-
-
class LlaMA(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
__openllm_internal__ = True
-
- def sanitize_parameters(
- self,
- prompt: str,
- top_k: int | None = None,
- top_p: float | None = None,
- temperature: float | None = None,
- max_new_tokens: int | None = None,
- use_default_prompt_template: bool = True,
- use_llama2_prompt: bool = True,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
_PROMPT = DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1")
template_variables = default_formatter.extract_template_variables(_PROMPT)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "temperature": temperature,
- "top_p": top_p,
- "top_k": top_k,
- }
-
- return prompt_text, generation_config, {}
-
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {}
@property
- def import_kwargs(self):
- model_kwds = {"device_map": "auto" if torch.cuda.device_count() > 1 else None}
- tokenizer_kwds: dict[str, t.Any] = {}
- return model_kwds, tokenizer_kwds
-
- def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
- return generation_result[0]
-
+ def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.device_count() > 1 else None}, {}
+ def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
from ..._generation import StopOnTokens
-
- generation_kwargs = {
- "generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
- "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
- }
-
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
- with torch.inference_mode():
- gen_tokens = self.model.generate(**inputs, **generation_kwargs)
- return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-
+ generation_kwargs = {"generation_config": self.config.model_construct_env(**attrs).to_generation_config(), "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()])}
+ with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), **generation_kwargs), skip_special_tokens=True, clean_up_tokenization_spaces=True)
def embeddings(self, prompts: list[str]) -> LLMEmbeddings:
encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device)
- input_ids = encoding["input_ids"]
- attention_mask = encoding["attention_mask"]
+ input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
with torch.inference_mode():
- model_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
- data = model_outputs.hidden_states[-1]
+ data = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
masked_embeddings = data * mask
- sum_embeddings = torch.sum(masked_embeddings, dim=1)
- seq_length = torch.sum(mask, dim=1)
- embedding = sum_embeddings / seq_length
- normalized_embeddings = F.normalize(embedding, p=2, dim=1)
- return {
- "embeddings": normalized_embeddings,
- "num_tokens": torch.sum(attention_mask).item(),
- }
+ sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
+ return {"embeddings": F.normalize(sum_embeddings / seq_length, p=2, dim=1), "num_tokens": torch.sum(attention_mask).item()}
diff --git a/src/openllm/models/mpt/__init__.py b/src/openllm/models/mpt/__init__.py
index 128f9420..8c1168cc 100644
--- a/src/openllm/models/mpt/__init__.py
+++ b/src/openllm/models/mpt/__init__.py
@@ -11,42 +11,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
try:
- if not is_torch_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_mpt"] = ["MPT"]
-
-
+ if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_mpt"] = ["MPT"]
if t.TYPE_CHECKING:
from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_mpt import PROMPT_MAPPING as PROMPT_MAPPING
from .configuration_mpt import START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING
from .configuration_mpt import MPTConfig as MPTConfig
-
try:
- if not is_torch_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_mpt import MPT as MPT
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_torch_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_mpt import MPT as MPT
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/mpt/configuration_mpt.py b/src/openllm/models/mpt/configuration_mpt.py
index 59fbd2b1..88b1cd97 100644
--- a/src/openllm/models/mpt/configuration_mpt.py
+++ b/src/openllm/models/mpt/configuration_mpt.py
@@ -11,20 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
import typing as t
-
import openllm
-
-
-if t.TYPE_CHECKING:
- MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
-else:
- # TODO: Support Literal string for LLMConfig
- MPTPromptType = str
-
-
+if t.TYPE_CHECKING: MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
+else: MPTPromptType = str
class MPTConfig(openllm.LLMConfig):
"""MPT is a decoder-style transformer pretrained from scratch on English text and code.
@@ -34,7 +25,6 @@ class MPTConfig(openllm.LLMConfig):
on HuggingFace. Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
for more details on specific models.
"""
-
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
@@ -53,27 +43,12 @@ class MPTConfig(openllm.LLMConfig):
"mosaicml/mpt-30b-chat",
],
}
-
- prompt_type: MPTPromptType = openllm.LLMConfig.Field(
- '"default"',
- description="""Given prompt type for running MPT. Default will be inferred from model name if pretrained.""",
- )
-
- max_sequence_length: int = openllm.LLMConfig.Field(
- 2048,
- description="""\
- Max sequence length to run MPT with. Note that MPT is trained ith sequence length
- of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096
- (for 7b models) and 16384 (for 30b models)
- """,
- )
-
+ prompt_type: MPTPromptType = openllm.LLMConfig.Field('"default"', description="""Given prompt type for running MPT. Default will be inferred from model name if pretrained.""")
+  max_sequence_length: int = openllm.LLMConfig.Field(2048, description="Max sequence length to run MPT with. Note that MPT is trained with sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can be set up to 4096 (for 7b models) and 16384 (for 30b models)")
class GenerationConfig:
max_new_tokens: int = 128
temperature: float = 0
top_p: float = 0.8
-
-
START_MPT_COMMAND_DOCSTRING = """\
Run a LLMServer for MPT model.
@@ -100,43 +75,16 @@ or provide `--model-id` flag when running ``openllm start mpt``:
\b
$ openllm start mpt --model-id mosaicml/mpt-30b
"""
-
-INSTRUCTION_KEY = "### Instruction:"
-RESPONSE_KEY = "### Response:"
-END_KEY = "### End"
-INTRO_BLURB = (
- "Below is an instruction that describes a task. Write a response that appropriately completes the request."
-)
-
+INSTRUCTION_KEY, RESPONSE_KEY, END_KEY = "### Instruction:", "### Response:", "### End"
+INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
# NOTE: This is the prompt that is used for generating responses using an already
# trained model. It ends with the response key, where the job of the model is to provide
# the completion that follows it (i.e. the response itself).
-_instruct_prompt = """{intro}
+_chat_prompt, _default_prompt, _instruct_prompt = """{instruction}""", """{instruction}""", """{intro}
{instruction_key}
{instruction}
{response_key}
-""".format(
- intro=INTRO_BLURB,
- instruction_key=INSTRUCTION_KEY,
- instruction="{instruction}",
- response_key=RESPONSE_KEY,
-)
-
-_default_prompt = """{instruction}"""
-
-# TODO: XXX implement me
-_chat_prompt = """{instruction}"""
-
-PROMPT_MAPPING = {
- "default": _default_prompt,
- "instruct": _instruct_prompt,
- "storywriter": _default_prompt,
- "chat": _chat_prompt,
-}
-
-
-def _get_prompt(model_type: str) -> str:
- return PROMPT_MAPPING[model_type]
-
-
+""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
+PROMPT_MAPPING = {"default": _default_prompt, "instruct": _instruct_prompt, "storywriter": _default_prompt, "chat": _chat_prompt}
+def _get_prompt(model_type: str) -> str: return PROMPT_MAPPING[model_type]
DEFAULT_PROMPT_TEMPLATE = _get_prompt
diff --git a/src/openllm/models/mpt/modeling_mpt.py b/src/openllm/models/mpt/modeling_mpt.py
index e6473c5e..681cbce1 100644
--- a/src/openllm/models/mpt/modeling_mpt.py
+++ b/src/openllm/models/mpt/modeling_mpt.py
@@ -15,189 +15,70 @@
from __future__ import annotations
import logging
import typing as t
-
import bentoml
import openllm
-
-from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE
+from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE, MPTPromptType
from ..._prompt import default_formatter
-from ...utils import generate_labels
-from ...utils import is_triton_available
-
-
-if t.TYPE_CHECKING:
- import torch
-
- import transformers
-
- from .configuration_mpt import MPTPromptType
-else:
- torch = openllm.utils.LazyLoader("torch", globals(), "torch")
- transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
+from ...utils import generate_labels, is_triton_available
+if t.TYPE_CHECKING: import transformers, torch
+else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
logger = logging.getLogger(__name__)
-
-
-def get_mpt_config(
- model_id_or_path: str,
- max_sequence_length: int,
- device: torch.device | str | int | None,
- device_map: str | None = None,
- trust_remote_code: bool = True,
-) -> transformers.PretrainedConfig:
+def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True) -> transformers.PretrainedConfig:
config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
- if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)):
- config.init_device = str(device)
- if hasattr(config, "attn_config") and is_triton_available():
- config.attn_config["attn_impl"] = "triton"
- else:
- logger.debug(
- "'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'"
- )
+ if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
+ if hasattr(config, "attn_config") and is_triton_available(): config.attn_config["attn_impl"] = "triton"
+ else: logger.debug("'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'")
# setting max_seq_len
config.max_seq_len = max_sequence_length
return config
-
-
class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXTokenizerFast"]):
__openllm_internal__ = True
-
- def llm_post_init(self):
- self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
+ def llm_post_init(self): self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
@property
- def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
- model_kwds = {"torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}
- tokenizer_kwds = {"padding_side": "left"}
- return model_kwds, tokenizer_kwds
-
+ def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left"}
def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
_, tokenizer_attrs = self.llm_parameters
-
torch_dtype = attrs.pop("torch_dtype", self.dtype)
device_map = attrs.pop("device_map", None)
attrs.pop("low_cpu_mem_usage", None)
-
- config = get_mpt_config(
- self.model_id,
- self.config.max_sequence_length,
- self.device,
- device_map=device_map,
- trust_remote_code=trust_remote_code,
- )
-
+ config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
- if tokenizer.pad_token_id is None:
- logger.warning("pad_token_id is not set. Setting it to eos_token")
- tokenizer.pad_token = tokenizer.eos_token
-
- model = transformers.AutoModelForCausalLM.from_pretrained(
- self.model_id,
- config=config,
- torch_dtype=torch_dtype,
- trust_remote_code=trust_remote_code,
- device_map=device_map,
- **attrs,
- )
- try:
- return bentoml.transformers.save_model(
- self.tag,
- model,
- custom_objects={"tokenizer": tokenizer},
- labels=generate_labels(self),
- )
- finally:
- torch.cuda.empty_cache()
-
+ if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
+ model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
+    try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+ finally: torch.cuda.empty_cache()
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
torch_dtype = attrs.pop("torch_dtype", self.dtype)
device_map = attrs.pop("device_map", None)
trust_remote_code = attrs.pop("trust_remote_code", True)
-
- _ref = bentoml.transformers.get(self.tag)
- config = get_mpt_config(
- _ref.path,
- self.config.max_sequence_length,
- self.device,
- device_map=device_map,
- trust_remote_code=trust_remote_code,
- )
- model = transformers.AutoModelForCausalLM.from_pretrained(
- _ref.path,
- config=config,
- trust_remote_code=trust_remote_code,
- torch_dtype=torch_dtype,
- device_map=device_map,
- **attrs,
- )
+    config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
+ model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs)
model.tie_weights()
return model
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- temperature: float | None = None,
- top_p: float | None = None,
- prompt_type: MPTPromptType | None = None,
- use_default_prompt_template: bool = True,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
if prompt_type is None:
- if "instruct" in self.model_id:
- prompt_type = "instruct"
- elif "storywriter" in self.model_id:
- prompt_type = "storywriter"
- elif "chat" in self.model_id:
- prompt_type = "chat"
- else:
- prompt_type = "default"
+ if "instruct" in self.model_id: prompt_type = "instruct"
+ elif "storywriter" in self.model_id: prompt_type = "storywriter"
+ elif "chat" in self.model_id: prompt_type = "chat"
+ else: prompt_type = "default"
_PROMPT = DEFAULT_PROMPT_TEMPLATE(prompt_type)
template_variables = default_formatter.extract_template_variables(_PROMPT)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "temperature": temperature,
- "top_p": top_p,
- }
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ generation_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
return prompt_text, generation_config, {}
-
- def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
- return generation_result[0]
-
+ def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
llm_config = self.config.model_construct_env(**attrs)
-
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-
- attrs = {
- "do_sample": False if llm_config["temperature"] == 0 else True,
- "eos_token_id": self.tokenizer.eos_token_id,
- "pad_token_id": self.tokenizer.pad_token_id,
- "generation_config": llm_config.to_generation_config(),
- }
-
+ attrs = {"do_sample": False if llm_config["temperature"] == 0 else True, "eos_token_id": self.tokenizer.eos_token_id, "pad_token_id": self.tokenizer.pad_token_id, "generation_config": llm_config.to_generation_config()}
with torch.inference_mode():
if torch.cuda.is_available():
with torch.autocast("cuda", torch.float16):
generated_tensors = self.model.generate(**inputs, **attrs)
- else:
- generated_tensors = self.model.generate(**inputs, **attrs)
-
+ else: generated_tensors = self.model.generate(**inputs, **attrs)
return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
diff --git a/src/openllm/models/opt/__init__.py b/src/openllm/models/opt/__init__.py
index 4df559a2..d76f0dce 100644
--- a/src/openllm/models/opt/__init__.py
+++ b/src/openllm/models/opt/__init__.py
@@ -11,75 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_flax_available
from ...utils import is_tf_available
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_opt": ["OPTConfig", "START_OPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_opt": ["OPTConfig", "START_OPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
- if not is_torch_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_opt"] = ["OPT"]
-
+ if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_opt"] = ["OPT"]
try:
- if not is_flax_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_flax_opt"] = ["FlaxOPT"]
-
+ if not is_flax_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_flax_opt"] = ["FlaxOPT"]
try:
- if not is_tf_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_tf_opt"] = ["TFOPT"]
-
-
+ if not is_tf_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_tf_opt"] = ["TFOPT"]
if t.TYPE_CHECKING:
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_opt import START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING
from .configuration_opt import OPTConfig as OPTConfig
-
try:
- if not is_torch_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_opt import OPT as OPT
-
+ if not is_torch_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_opt import OPT as OPT
try:
- if not is_flax_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_flax_opt import FlaxOPT as FlaxOPT
-
+ if not is_flax_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_flax_opt import FlaxOPT as FlaxOPT
try:
- if not is_tf_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_tf_opt import TFOPT as TFOPT
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_tf_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_tf_opt import TFOPT as TFOPT
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/opt/configuration_opt.py b/src/openllm/models/opt/configuration_opt.py
index a1392a46..7abe26e5 100644
--- a/src/openllm/models/opt/configuration_opt.py
+++ b/src/openllm/models/opt/configuration_opt.py
@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
-
import openllm
-
-
class OPTConfig(openllm.LLMConfig):
"""OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.
@@ -27,13 +23,12 @@ class OPTConfig(openllm.LLMConfig):
Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
"""
-
__config__ = {
"name_type": "lowercase",
"trust_remote_code": False,
"url": "https://huggingface.co/docs/transformers/model_doc/opt",
"default_id": "facebook/opt-1.3b",
- "architecture": "MPTForCausalLM",
+ "architecture": "OPTForCausalLM",
"model_ids": [
"facebook/opt-125m",
"facebook/opt-350m",
@@ -53,20 +48,12 @@ class OPTConfig(openllm.LLMConfig):
},
),
}
-
- format_outputs: bool = openllm.LLMConfig.Field(
- False,
- description="""Whether to format the outputs. This
- can be used when num_return_sequences > 1.""",
- )
-
+ format_outputs: bool = openllm.LLMConfig.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""")
class GenerationConfig:
top_k: int = 15
temperature: float = 0.75
max_new_tokens: int = 1024
num_return_sequences: int = 1
-
-
START_OPT_COMMAND_DOCSTRING = """\
Run a LLMServer for OPT model.
@@ -92,5 +79,4 @@ or provide `--model-id` flag when running ``openllm start opt``:
\b
$ openllm start opt --model-id facebook/opt-6.7b
"""
-
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
diff --git a/src/openllm/models/opt/modeling_flax_opt.py b/src/openllm/models/opt/modeling_flax_opt.py
index 00e1a0f1..bc4aba57 100644
--- a/src/openllm/models/opt/modeling_flax_opt.py
+++ b/src/openllm/models/opt/modeling_flax_opt.py
@@ -11,109 +11,37 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
import logging
import typing as t
-
import bentoml
import openllm
-
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
from ...utils import generate_labels
-
-
-if t.TYPE_CHECKING:
- import transformers
-else:
- transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
+if t.TYPE_CHECKING: import transformers
+else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
-
-
class FlaxOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
__openllm_internal__ = True
-
@property
- def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
- tokenizer_kwds = {
- "padding_side": "left",
- "truncation_side": "left",
- }
- return {}, tokenizer_kwds
+ def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {}, {"padding_side": "left", "truncation_side": "left"}
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
- _, tokenizer_attrs = self.llm_parameters
-
- config = transformers.AutoConfig.from_pretrained(self.model_id)
- tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
+ config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
- model = t.cast(
- "transformers.FlaxOPTForCausalLM",
- transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs),
- )
- return bentoml.transformers.save_model(
- self.tag,
- model,
- custom_objects={"tokenizer": tokenizer},
- labels=generate_labels(self),
- )
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- temperature: float | None = None,
- top_k: int | None = None,
- num_return_sequences: int | None = None,
- repetition_penalty: float | None = None,
- use_default_prompt_template: bool = False,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ return bentoml.transformers.save_model(self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "temperature": temperature,
- "top_k": top_k,
- "num_return_sequences": num_return_sequences,
- "repetition_penalty": repetition_penalty,
- }
- return prompt_text, generation_config, {}
-
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
- if len(generation_result) == 1:
- if self.config.format_outputs:
- logger.warning("'format_outputs' doesn't have any effect when 'num_return_sequences=1'")
- return generation_result[0]
-
- if self.config.format_outputs:
- return "Generated result:\n" + "\n -".join(generation_result)
- else:
- return "\n".join(generation_result)
-
- def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
- input_ids = self.tokenizer(prompt, return_tensors="np")
- generated_tensors = self.model.generate(
- **input_ids,
- do_sample=True,
- generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
- )
- return self.tokenizer.batch_decode(generated_tensors.sequences, skip_special_tokens=True)
+ if len(generation_result) == 1: return generation_result[0]
+ if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
+ else: return "\n".join(generation_result)
+    def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True)
diff --git a/src/openllm/models/opt/modeling_opt.py b/src/openllm/models/opt/modeling_opt.py
index 5a97d347..14ca2eef 100644
--- a/src/openllm/models/opt/modeling_opt.py
+++ b/src/openllm/models/opt/modeling_opt.py
@@ -11,129 +11,38 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
import logging
import typing as t
-
-import bentoml
import openllm
-
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
-from ...utils import generate_labels
-
-
if t.TYPE_CHECKING:
- import torch
-
- import transformers
+ import torch, transformers
else:
- torch = openllm.utils.LazyLoader("torch", globals(), "torch")
- transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
+ torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
-
-
class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer"]):
__openllm_internal__ = True
-
- def llm_post_init(self):
- self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
+ def llm_post_init(self): self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
@property
- def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
- model_kwds = {
- "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
- "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
- }
- tokenizer_kwds = {
- "padding_side": "left",
- "truncation_side": "left",
- }
- return model_kwds, tokenizer_kwds
-
- def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
- _, tokenizer_attrs = self.llm_parameters
-
- torch_dtype = attrs.pop("torch_dtype", self.dtype)
-
- config = transformers.AutoConfig.from_pretrained(self.model_id)
- tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
- tokenizer.pad_token_id = config.pad_token_id
- model = t.cast(
- "transformers.OPTForCausalLM",
- transformers.AutoModelForCausalLM.from_pretrained(
- self.model_id, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, **attrs
- ),
- )
- return bentoml.transformers.save_model(
- self.tag,
- model,
- custom_objects={"tokenizer": tokenizer},
- labels=generate_labels(self),
- )
-
+ def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left", "truncation_side": "left"}
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
torch_dtype = attrs.pop("torch_dtype", self.dtype)
- model: transformers.OPTForCausalLM = transformers.AutoModelForCausalLM.from_pretrained(
- bentoml.transformers.get(self.tag).path, *args, torch_dtype=torch_dtype, **attrs
- )
+ model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, torch_dtype=torch_dtype, **attrs)
return model
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- temperature: float | None = None,
- top_k: int | None = None,
- num_return_sequences: int | None = None,
- use_default_prompt_template: bool = False,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "temperature": temperature,
- "top_k": top_k,
- "num_return_sequences": num_return_sequences,
- }
- return prompt_text, generation_config, {}
-
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
- if len(generation_result) == 1:
- if self.config.format_outputs:
- logger.warning("'format_outputs' doesn't have any effect when 'num_return_sequences=1'")
- return generation_result[0]
-
- if self.config.format_outputs:
- return "Generated result:\n" + "\n -".join(generation_result)
- else:
- return "\n".join(generation_result)
-
+ if len(generation_result) == 1: return generation_result[0]
+ if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
+ else: return "\n".join(generation_result)
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
- with torch.inference_mode():
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
- generated_tensors = self.model.generate(
- **inputs,
- do_sample=True,
- generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
- )
- return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
+ with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
diff --git a/src/openllm/models/opt/modeling_tf_opt.py b/src/openllm/models/opt/modeling_tf_opt.py
index cada4e3d..aa121cb5 100644
--- a/src/openllm/models/opt/modeling_tf_opt.py
+++ b/src/openllm/models/opt/modeling_tf_opt.py
@@ -11,107 +11,36 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
import logging
import typing as t
-
import bentoml
import openllm
-
from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
from ..._prompt import default_formatter
from ...utils import generate_labels
-
-
-if t.TYPE_CHECKING:
- import transformers
-else:
- transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
-
+if t.TYPE_CHECKING: import transformers
+else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
-
-
class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
__openllm_internal__ = True
-
@property
- def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
- tokenizer_kwds = {
- "padding_side": "left",
- "truncation_side": "left",
- }
- return {}, tokenizer_kwds
-
+ def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {}, {"padding_side": "left", "truncation_side": "left"}
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
- _, tokenizer_attrs = self.llm_parameters
-
- config = transformers.AutoConfig.from_pretrained(self.model_id)
- tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
+ config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
tokenizer.pad_token_id = config.pad_token_id
- model: transformers.TFOPTForCausalLM = transformers.TFOPTForCausalLM.from_pretrained(
- self.model_id, trust_remote_code=trust_remote_code, **attrs
- )
- return bentoml.transformers.save_model(
- self.tag,
- model,
- custom_objects={"tokenizer": tokenizer},
- labels=generate_labels(self),
- )
-
- def sanitize_parameters(
- self,
- prompt: str,
- max_new_tokens: int | None = None,
- temperature: float | None = None,
- top_k: int | None = None,
- num_return_sequences: int | None = None,
- use_default_prompt_template: bool = False,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ return bentoml.transformers.save_model(self.tag, transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+ def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
- try:
- prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
- except KeyError as e:
- raise RuntimeError(
- f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
- "Use 'use_default_prompt_template=False' to disable the default prompt template."
- ) from None
- else:
- prompt_text = prompt
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "temperature": temperature,
- "top_k": top_k,
- "num_return_sequences": num_return_sequences,
- }
- return prompt_text, generation_config, {}
-
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+ try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+ except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+ else: prompt_text = prompt
+ return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
- if len(generation_result) == 1:
- if self.config.format_outputs:
- logger.warning("'format_outputs' doesn't have any effect when 'num_return_sequences=1'")
- return generation_result[0]
-
- if self.config.format_outputs:
- return "Generated result:\n" + "\n -".join(generation_result)
- else:
- return "\n".join(generation_result)
-
- def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
- input_ids = self.tokenizer(prompt, return_tensors="tf")
- generated_tensors = self.model.generate(
- **input_ids,
- do_sample=True,
- generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
- )
- return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
+ if len(generation_result) == 1: return generation_result[0]
+ if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
+ else: return "\n".join(generation_result)
+ def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
diff --git a/src/openllm/models/stablelm/__init__.py b/src/openllm/models/stablelm/__init__.py
index 11c86086..fbcbc5f6 100644
--- a/src/openllm/models/stablelm/__init__.py
+++ b/src/openllm/models/stablelm/__init__.py
@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_stablelm": ["StableLMConfig", "START_STABLELM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_stablelm": ["StableLMConfig", "START_STABLELM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
- if not is_torch_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_stablelm"] = ["StableLM"]
-
+ if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_stablelm"] = ["StableLM"]
if t.TYPE_CHECKING:
from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_stablelm import START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING
from .configuration_stablelm import StableLMConfig as StableLMConfig
-
try:
- if not is_torch_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_stablelm import StableLM as StableLM
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_torch_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_stablelm import StableLM as StableLM
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/stablelm/configuration_stablelm.py b/src/openllm/models/stablelm/configuration_stablelm.py
index c6b7ccac..aa21ca23 100644
--- a/src/openllm/models/stablelm/configuration_stablelm.py
+++ b/src/openllm/models/stablelm/configuration_stablelm.py
@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
-
import openllm
-
-
class StableLMConfig(openllm.LLMConfig):
"""StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.
@@ -30,7 +27,6 @@ class StableLMConfig(openllm.LLMConfig):
and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)
for more information.
"""
-
__config__ = {
"name_type": "lowercase",
"url": "https://github.com/Stability-AI/StableLM",
@@ -43,14 +39,11 @@ class StableLMConfig(openllm.LLMConfig):
"stabilityai/stablelm-base-alpha-7b",
],
}
-
class GenerationConfig:
temperature: float = 0.9
max_new_tokens: int = 128
top_k: int = 0
top_p: float = 0.9
-
-
START_STABLELM_COMMAND_DOCSTRING = """\
Run a LLMServer for StableLM model.
@@ -70,12 +63,10 @@ or provide `--model-id` flag when running ``openllm start stablelm``:
\b
$ openllm start stablelm --model-id 'stabilityai/stablelm-tuned-alpha-3b'
"""
-
SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""
-
DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>"""
diff --git a/src/openllm/models/stablelm/modeling_stablelm.py b/src/openllm/models/stablelm/modeling_stablelm.py
index b31d1768..0f8384bb 100644
--- a/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/src/openllm/models/stablelm/modeling_stablelm.py
@@ -14,91 +14,27 @@
from __future__ import annotations
import logging
import typing as t
-
import openllm
-
from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE
from .configuration_stablelm import SYSTEM_PROMPT
from ..._prompt import default_formatter
-
-
-if t.TYPE_CHECKING:
- import transformers # noqa
- import torch
-else:
- transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
- torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-
-
+if t.TYPE_CHECKING: import transformers, torch
+else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
logger = logging.getLogger(__name__)
-
-
class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
__openllm_internal__ = True
-
- def llm_post_init(self):
- self.bettertransformer = True if not torch.cuda.is_available() else False
-
+ def llm_post_init(self): self.bettertransformer = True if not torch.cuda.is_available() else False
@property
- def import_kwargs(self):
- model_kwds = {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}
- tokenizer_kwds: dict[str, t.Any] = {}
- return model_kwds, tokenizer_kwds
-
- def sanitize_parameters(
- self,
- prompt: str,
- temperature: float | None = None,
- max_new_tokens: int | None = None,
- top_k: int | None = None,
- top_p: float | None = None,
- use_default_prompt_template: bool = False,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+ def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if "tuned" in self._model_id and use_default_prompt_template:
- prompt_variables = {
- k: v
- for k, v in attrs.items()
- if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
- }
- if "instruction" in prompt_variables:
- raise RuntimeError(
- "'instruction' should be passed as the first argument "
- "instead of kwargs when 'use_default_prompt_template=True'"
- )
+ prompt_variables = {k: v for k, v in attrs.items() if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)}
+ if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
system_prompt = prompt_variables.pop("system_prompt", SYSTEM_PROMPT)
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, system_prompt=system_prompt)
- else:
- prompt_text = prompt
-
- generation_config = {
- "max_new_tokens": max_new_tokens,
- "temperature": temperature,
- "top_k": top_k,
- "top_p": top_p,
- }
-
- return prompt_text, generation_config, {}
-
- def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
- return generation_result[0]
-
+ else: prompt_text = prompt
+ return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {}
+ def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
from ..._generation import StopOnTokens
-
- generation_kwargs = {
- "do_sample": True,
- "generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
- "pad_token_id": self.tokenizer.eos_token_id,
- "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
- }
-
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-
- with torch.inference_mode():
- if torch.cuda.is_available():
- with torch.autocast("cuda", torch.float16):
- tokens = self.model.generate(**inputs, **generation_kwargs)
- else:
- tokens = self.model.generate(**inputs, **generation_kwargs)
- return [self.tokenizer.decode(tokens[0], skip_special_tokens=True)]
+    with torch.inference_mode(), torch.autocast("cuda", torch.float16, enabled=torch.cuda.is_available()): return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=transformers.StoppingCriteriaList([StopOnTokens()]))[0], skip_special_tokens=True)]
diff --git a/src/openllm/models/starcoder/__init__.py b/src/openllm/models/starcoder/__init__.py
index b73c4f09..051af027 100644
--- a/src/openllm/models/starcoder/__init__.py
+++ b/src/openllm/models/starcoder/__init__.py
@@ -11,40 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
from __future__ import annotations
+import sys
import typing as t
-
from ...exceptions import MissingDependencyError
from ...utils import LazyModule
from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
- "configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
try:
- if not is_torch_available():
- raise MissingDependencyError
-except MissingDependencyError:
- pass
-else:
- _import_structure["modeling_starcoder"] = ["StarCoder"]
-
+ if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_starcoder"] = ["StarCoder"]
if t.TYPE_CHECKING:
from .configuration_starcoder import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
from .configuration_starcoder import START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING
from .configuration_starcoder import StarCoderConfig as StarCoderConfig
-
try:
- if not is_torch_available():
- raise MissingDependencyError
- except MissingDependencyError:
- pass
- else:
- from .modeling_starcoder import StarCoder as StarCoder
-else:
- import sys
-
- sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+ if not is_torch_available(): raise MissingDependencyError
+ except MissingDependencyError: pass
+ else: from .modeling_starcoder import StarCoder as StarCoder
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/openllm/models/starcoder/configuration_starcoder.py b/src/openllm/models/starcoder/configuration_starcoder.py
index 05af5dfe..7349b673 100644
--- a/src/openllm/models/starcoder/configuration_starcoder.py
+++ b/src/openllm/models/starcoder/configuration_starcoder.py
@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
-
import openllm
-
-
class StarCoderConfig(openllm.LLMConfig):
"""The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.
@@ -25,7 +22,6 @@ class StarCoderConfig(openllm.LLMConfig):
Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information.
"""
-
__config__ = {
"name_type": "lowercase",
"requires_gpu": True,
@@ -36,7 +32,6 @@ class StarCoderConfig(openllm.LLMConfig):
"default_id": "bigcode/starcoder",
"model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"],
}
-
class GenerationConfig:
temperature: float = 0.2
max_new_tokens: int = 256
@@ -45,8 +40,6 @@ class StarCoderConfig(openllm.LLMConfig):
top_p: float = 0.95
pad_token_id: int = 49152
repetition_penalty: float = 1.2
-
-
START_STARCODER_COMMAND_DOCSTRING = """\
Run a LLMServer for StarCoder model.
@@ -66,5 +59,4 @@ or provide `--model-id` flag when running ``openllm start starcoder``:
\b
$ openllm start starcoder --model-id 'bigcode/starcoder'
"""
-
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py
index 362ff5d6..93da678a 100644
--- a/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/src/openllm/models/starcoder/modeling_starcoder.py
@@ -14,143 +14,53 @@
from __future__ import annotations
import logging
import typing as t
-
import bentoml
import openllm
-
from ...utils import generate_labels
-
-
if t.TYPE_CHECKING:
- import torch
-
- import transformers
+ import torch, transformers
else:
- torch = openllm.utils.LazyLoader("torch", globals(), "torch")
- transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
+ torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
logger = logging.getLogger(__name__)
-
-FIM_PREFIX = "<fim-prefix>"
-FIM_MIDDLE = "<fim-middle>"
-FIM_SUFFIX = "<fim-suffix>"
-FIM_PAD = "<fim-pad>"
-EOD = "<|endoftext|>"
-FIM_INDICATOR = "<FILL_HERE>"
-
-
+FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = "<fim-prefix>", "<fim-middle>", "<fim-suffix>", "<fim-pad>", "<|endoftext|>", "<FILL_HERE>"
class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
__openllm_internal__ = True
-
@property
- def import_kwargs(self):
- model_kwds = {
- "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
- "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
- }
- tokenizer_kwds = {"padding_side": "left"}
- return model_kwds, tokenizer_kwds
-
+ def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left"}
def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
- _, tokenizer_attrs = self.llm_parameters
-
- torch_dtype = attrs.pop("torch_dtype", torch.float16)
- device_map = attrs.pop("device_map", "auto")
-
- tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
- tokenizer.add_special_tokens(
- {
- "additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
- "pad_token": EOD,
- }
- )
-
- model = transformers.AutoModelForCausalLM.from_pretrained(
- self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs
- )
- try:
- return bentoml.transformers.save_model(
- self.tag,
- model,
- custom_objects={"tokenizer": tokenizer},
- labels=generate_labels(self),
- )
- finally:
- # NOTE: We need to free the cache after saving here so that we can load it back later on.
- torch.cuda.empty_cache()
-
- def sanitize_parameters(
- self,
- prompt: str,
- temperature: float | None = None,
- top_p: float | None = None,
- max_new_tokens: int | None = None,
- repetition_penalty: float | None = None,
- **attrs: t.Any,
- ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
- fim_mode = FIM_INDICATOR in prompt
- prefix, suffix = None, None
+ torch_dtype, device_map = attrs.pop("torch_dtype", torch.float16), attrs.pop("device_map", "auto")
+ tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+ tokenizer.add_special_tokens({"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], "pad_token": EOD})
+ model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
+ try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+ finally: torch.cuda.empty_cache()
+ def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+ fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None
if fim_mode:
- try:
- prefix, suffix = prompt.split(FIM_INDICATOR)
- except Exception as err:
- logger.error("Error while processing prompt with FIM mode:\n", exc_info=err)
- raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
+ try: prefix, suffix = prompt.split(FIM_INDICATOR)
+ except Exception as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
- else:
- prompt_text = prompt
-
- generation_config = {
- "temperature": temperature,
- "top_p": top_p,
- "max_new_tokens": max_new_tokens,
- "repetition_penalty": repetition_penalty,
- # XXX: This value is currently a hack, need more investigate why the
- # default starcoder doesn't include the same value as santacoder EOD
- "pad_token_id": 49152,
- **attrs,
- }
-
- return prompt_text, generation_config, {}
-
- def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
- return generation_result[0]
+ else: prompt_text = prompt
+    # XXX: This value for pad_token_id is currently a hack; needs more investigation into why the
+    # default starcoder doesn't include the same value as santacoder EOD
+ return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {}
+ def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
with torch.inference_mode():
- inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
- result_tensor = self.model.generate(
- inputs,
- do_sample=True,
- pad_token_id=self.tokenizer.eos_token_id,
- # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
- generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
- )
+ # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
+ # NOTE: support fine-tuning starcoder
+ result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors="pt").to(self.device), do_sample=True, pad_token_id=self.tokenizer.eos_token_id, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
# TODO: We will probably want to return the tokenizer here so that we can manually process this
# return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
- return self.tokenizer.batch_decode(
- result_tensor[0],
- skip_special_tokens=True,
- clean_up_tokenization_spaces=True,
- )
-
- def generate_one(
- self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any
- ) -> list[dict[t.Literal["generated_text"], str]]:
+ return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+ def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
from ..._generation import StopSequenceCriteria
-
- max_new_tokens = preprocess_generate_kwds.pop("max_new_tokens", 200)
- encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
- src_len = encoded_inputs["input_ids"].shape[1]
- stopping_criteria = preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
+ max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
+ src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
stopping_criteria.append(StopSequenceCriteria(stop, self.tokenizer))
- outputs = self.model.generate(
- encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria
- )
-
- result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
+ result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
# Inference API returns the stop sequence
for stop_seq in stop:
- if result.endswith(stop_seq):
- result = result[: -len(stop_seq)]
+ if result.endswith(stop_seq): result = result[: -len(stop_seq)]
return [{"generated_text": result}]
diff --git a/src/openllm/serialisation/transformers.py b/src/openllm/serialisation/transformers.py
index 47ca2a21..ee5e86f7 100644
--- a/src/openllm/serialisation/transformers.py
+++ b/src/openllm/serialisation/transformers.py
@@ -193,6 +193,10 @@ def import_model(
if _tokenizer.pad_token is None:
_tokenizer.pad_token = _tokenizer.eos_token
+ # NOTE: quick hack to set the loaded into llm object
+ object.__setattr__(llm, "__llm_model__", model)
+ object.__setattr__(llm, "__llm_tokenizer__", _tokenizer)
+
try:
with bentoml.models.create(
llm.tag,
@@ -210,9 +214,7 @@ def import_model(
else None,
metadata=metadata,
) as bentomodel:
- save_pretrained(
- llm, bentomodel.path, model=model, tokenizer=_tokenizer, safe_serialization=safe_serialisation
- )
+ save_pretrained(llm, bentomodel.path, safe_serialization=safe_serialisation)
return bentomodel
finally:
# NOTE: We need to free up the cache after importing the model
@@ -296,12 +298,12 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
or getattr(model, "is_loaded_in_4bit", False)
or getattr(model, "is_quantized", False)
)
- if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
+ if torch.cuda.is_available() and not loaded_in_kbit:
try:
model = model.to("cuda")
except torch.cuda.OutOfMemoryError as err:
raise RuntimeError(
- f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
+ f"Failed to convert {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
) from err
if llm.bettertransformer and llm.__llm_implementation__ == "pt" and not isinstance(model, _transformers.Pipeline):
# BetterTransformer is currently only supported on PyTorch.
@@ -314,27 +316,19 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
def save_pretrained(
llm: openllm.LLM[M, T],
save_directory: str,
- model: M | None = None,
- tokenizer: T | None = None,
is_main_process: bool = True,
state_dict: DictStrAny | None = None,
save_function: t.Callable[..., None] | None = None,
push_to_hub: bool = False,
- max_shard_size: int | str = "10GB",
+ max_shard_size: int | str = "2GB",
safe_serialization: bool = False,
variant: str | None = None,
**attrs: t.Any,
) -> None:
"""Light wrapper around ``transformers.PreTrainedTokenizer.save_pretrained`` and ``transformers.PreTrainedModel.save_pretrained``."""
- model = first_not_none(model, default=llm.__llm_model__)
- tokenizer = first_not_none(tokenizer, default=llm.__llm_tokenizer__)
save_function = first_not_none(save_function, default=torch.save)
model_save_attrs, tokenizer_save_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs)
safe_serialization = safe_serialization or llm._serialisation_format == "safetensors"
-
- if model is None or tokenizer is None:
- raise RuntimeError("Failed to find loaded model or tokenizer to save to local store.")
-
if llm._quantize_method == "gptq":
if not is_autogptq_available():
raise OpenLLMException(
@@ -342,11 +336,11 @@ def save_pretrained(
)
if llm.config["model_type"] != "causal_lm":
raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
- model.save_quantized(save_directory, use_safetensors=safe_serialization)
- elif isinstance(model, _transformers.Pipeline):
- model.save_pretrained(save_directory, safe_serialization=safe_serialization)
+ llm.model.save_quantized(save_directory, use_safetensors=safe_serialization)
+ elif isinstance(llm.model, _transformers.Pipeline):
+ llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
else:
- model.save_pretrained(
+ llm.model.save_pretrained(
save_directory,
is_main_process=is_main_process,
state_dict=state_dict,
@@ -357,4 +351,4 @@ def save_pretrained(
variant=variant,
**model_save_attrs,
)
- tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
+ llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
diff --git a/src/openllm_client/runtimes/base.py b/src/openllm_client/runtimes/base.py
index 6e3d2cb1..ecad2c87 100644
--- a/src/openllm_client/runtimes/base.py
+++ b/src/openllm_client/runtimes/base.py
@@ -90,6 +90,10 @@ class ClientMeta(t.Generic[T]):
@property
def _hf_agent(self) -> transformers.HfAgent:
+ if not self.supports_hf_agent:
+ raise openllm.exceptions.OpenLLMException(
+ f"{self.model_name} ({self.framework}) does not support running HF agent."
+ )
if self.__agent__ is None:
if not openllm.utils.is_transformers_supports_agent():
raise RuntimeError(
@@ -130,6 +134,16 @@ class ClientMeta(t.Generic[T]):
def configuration(self) -> dict[str, t.Any]:
raise NotImplementedError
+ @property
+ @abstractmethod
+ def supports_embeddings(self) -> bool:
+ raise NotImplementedError
+
+ @property
+ @abstractmethod
+ def supports_hf_agent(self) -> bool:
+ raise NotImplementedError
+
@property
def llm(self) -> openllm.LLM[t.Any, t.Any]:
if self.__llm__ is None:
diff --git a/src/openllm_client/runtimes/grpc.py b/src/openllm_client/runtimes/grpc.py
index e5b4e2d9..fa928f90 100644
--- a/src/openllm_client/runtimes/grpc.py
+++ b/src/openllm_client/runtimes/grpc.py
@@ -80,6 +80,20 @@ class GrpcClientMixin:
except KeyError:
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+ @property
+ def supports_embeddings(self) -> bool:
+ try:
+ return self._metadata.json.struct_value.fields["supports_embeddings"].bool_value
+ except KeyError:
+ raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
+ @property
+ def supports_hf_agent(self) -> bool:
+ try:
+ return self._metadata.json.struct_value.fields["supports_hf_agent"].bool_value
+ except KeyError:
+ raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
if isinstance(result, dict):
return openllm.GenerationOutput(**result)
diff --git a/src/openllm_client/runtimes/http.py b/src/openllm_client/runtimes/http.py
index 7859d8ca..ade5163f 100644
--- a/src/openllm_client/runtimes/http.py
+++ b/src/openllm_client/runtimes/http.py
@@ -77,6 +77,20 @@ class HTTPClientMixin:
except KeyError:
raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+ @property
+ def supports_embeddings(self) -> bool:
+ try:
+ return self._metadata.get("supports_embeddings", False)
+ except KeyError:
+ raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
+ @property
+ def supports_hf_agent(self) -> bool:
+ try:
+ return self._metadata.get("supports_hf_agent", False)
+ except KeyError:
+ raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput:
return openllm.GenerationOutput(**result)
diff --git a/typings/attr/__init__.pyi b/typings/attr/__init__.pyi
index 4cd16215..ebc372aa 100644
--- a/typings/attr/__init__.pyi
+++ b/typings/attr/__init__.pyi
@@ -12,6 +12,7 @@ from typing import Protocol
from typing import Sequence
from typing import Tuple
from typing import Type
+from typing import TypeAlias
from typing import TypeGuard
from typing import TypeVar
from typing import Union
@@ -40,16 +41,16 @@ __copyright__: str
_T = TypeVar("_T")
_C = TypeVar("_C", bound=type)
_P = ParamSpec("_P")
-_EqOrderType = Union[bool, Callable[[Any], Any]]
-_ValidatorType = Callable[[Any, Attribute[_T], _T], Any]
-_ConverterType = Callable[[Any], Any]
-_FilterType = Callable[[Attribute[_T], _T], bool]
-_ReprType = Callable[[Any], str]
-_ReprArgType = Union[bool, _ReprType]
-_OnSetAttrType = Callable[[Any, Attribute[Any], Any], Any]
-_OnSetAttrArgType = Union[_OnSetAttrType, List[_OnSetAttrType], setters._NoOpType]
-_FieldTransformer = Callable[[type, List[Attribute[Any]]], List[Attribute[Any]]]
-_ValidatorArgType = Union[_ValidatorType[_T], Sequence[_ValidatorType[_T]]]
+_EqOrderType: TypeAlias = Union[bool, Callable[[Any], Any]]
+_ValidatorType: TypeAlias = Callable[[Any, Attribute[_T], _T], Any]
+_ConverterType: TypeAlias = Callable[[Any], Any]
+_FilterType: TypeAlias = Callable[[Attribute[_T], _T], bool]
+_ReprType: TypeAlias = Callable[[Any], str]
+_ReprArgType: TypeAlias = Union[bool, _ReprType]
+_OnSetAttrType: TypeAlias = Callable[[Any, Attribute[Any], Any], Any]
+_OnSetAttrArgType: TypeAlias = Union[_OnSetAttrType, List[_OnSetAttrType], setters._NoOpType]
+_FieldTransformer: TypeAlias = Callable[[type, List[Attribute[Any]]], List[Attribute[Any]]]
+_ValidatorArgType: TypeAlias = Union[_ValidatorType[_T], Sequence[_ValidatorType[_T]]]
class AttrsInstance(AttrsInstance_, Protocol): ...
@@ -535,8 +536,10 @@ def get_run_validators() -> bool: ...
# aliases --
-s = attributes = attrs
-ib = attr = attrib
+s = attrs
+attributes = attrs
+ib = attrib
+attr = attrib
dataclass = attrs # Technically, partial(attrs, auto_attribs=True) ;)
class ReprProtocol(Protocol):
diff --git a/typings/attr/_cmp.pyi b/typings/attr/_cmp.pyi
index 044d9f13..724d3665 100644
--- a/typings/attr/_cmp.pyi
+++ b/typings/attr/_cmp.pyi
@@ -1,8 +1,9 @@
from typing import Any
from typing import Callable
from typing import Optional
+from typing import TypeAlias
-_CompareWithType = Callable[[Any, Any], bool]
+_CompareWithType: TypeAlias = Callable[[Any, Any], bool]
def cmp_using(
eq: Optional[_CompareWithType] = ...,
diff --git a/typings/attr/_compat.pyi b/typings/attr/_compat.pyi
index 1e97cee0..3014db28 100644
--- a/typings/attr/_compat.pyi
+++ b/typings/attr/_compat.pyi
@@ -1,5 +1,5 @@
-from typing import Any
import threading
+from typing import Any
def set_closure_cell(cell: Any, value: Any) -> None: ...
diff --git a/typings/attr/_make.pyi b/typings/attr/_make.pyi
index 29c1b6e0..ce62f42c 100644
--- a/typings/attr/_make.pyi
+++ b/typings/attr/_make.pyi
@@ -1,4 +1,4 @@
from . import _CountingAttr as _CountingAttr
-from . import _make_repr as _make_repr
from . import _make_init as _make_init
+from . import _make_repr as _make_repr
from . import _transform_attrs as _transform_attrs
diff --git a/typings/click_option_group/_core.pyi b/typings/click_option_group/_core.pyi
index b423f216..bac052f6 100644
--- a/typings/click_option_group/_core.pyi
+++ b/typings/click_option_group/_core.pyi
@@ -15,7 +15,7 @@ import click
_R = TypeVar("_R")
_T = TypeVar("_T")
-AnyCallable = Callable[..., Any]
+AnyCallable: TypeAlias = Callable[..., Any]
Decorator: TypeAlias = Callable[[_T], _T]
_FC = TypeVar("_FC", bound=Union[AnyCallable, click.Command])
diff --git a/typings/deepmerge/merger.pyi b/typings/deepmerge/merger.pyi
index 19501cdb..6d983ab8 100644
--- a/typings/deepmerge/merger.pyi
+++ b/typings/deepmerge/merger.pyi
@@ -2,6 +2,7 @@ from typing import Any
from typing import Dict
from typing import List
from typing import Tuple
+from typing import TypeAlias
from typing import Union
from .strategy.core import StrategyList
@@ -9,7 +10,7 @@ from .strategy.dict import DictStrategies
from .strategy.list import ListStrategies
from .strategy.set import SetStrategies
-ConfigDictType = Dict[str, Any]
+ConfigDictType: TypeAlias = Dict[str, Any]
class Merger:
PROVIDED_TYPE_STRATEGIES: Dict[type, Union[ListStrategies, DictStrategies, SetStrategies]] = ...
diff --git a/typings/deepmerge/strategy/core.pyi b/typings/deepmerge/strategy/core.pyi
index 027394d7..5d92dcf7 100644
--- a/typings/deepmerge/strategy/core.pyi
+++ b/typings/deepmerge/strategy/core.pyi
@@ -2,9 +2,10 @@ from typing import Any
from typing import Callable
from typing import List
from typing import Optional
+from typing import TypeAlias
from typing import Union
-_StringOrFunction = Union[str, Callable[..., Any]]
+_StringOrFunction: TypeAlias = Union[str, Callable[..., Any]]
STRATEGY_END: object = ...
class StrategyList:
diff --git a/typings/jupytext/config.pyi b/typings/jupytext/config.pyi
index 2197126b..3dea3c1b 100644
--- a/typings/jupytext/config.pyi
+++ b/typings/jupytext/config.pyi
@@ -1,6 +1,6 @@
+from collections.abc import Generator
from typing import Any
from typing import Dict
-from collections.abc import Generator
from _typeshed import Incomplete
diff --git a/typings/jupytext/formats.pyi b/typings/jupytext/formats.pyi
index 7e62ca96..c9a21334 100644
--- a/typings/jupytext/formats.pyi
+++ b/typings/jupytext/formats.pyi
@@ -1,4 +1,5 @@
from typing import Any
+
from _typeshed import Incomplete
class JupytextFormatError(ValueError): ...
|