feat(service): provisional API (#133)

2026-06-11 18:09:52 -04:00 · 2023-07-23 02:15:39 -04:00
parent d88b069160
commit 693631958a
59 changed files with 683 additions and 2085 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,7 +20,7 @@ ci:
 exclude: '.*\.(css|js|svg)$'
 repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.0.278'
+    rev: 'v0.0.280'
    hooks:
      - id: ruff
        args: [--exit-non-zero-on-fix, --show-fixes]
@@ -28,6 +28,8 @@ repos:
    rev: 23.7.0
    hooks:
      - id: black-jupyter
+        args: [--config=pyproject.toml]
+        exclude: (?x)^(src/openllm/models/.*)$
  - repo: https://github.com/econchick/interrogate
    rev: 1.5.0
    hooks:
@@ -50,7 +52,6 @@ repos:
              tools/.*|
              tests/.*|
              src/openllm/playground/.*|
-              src/openllm/models/.*|
              .github/.*
          )$
        additional_dependencies: ["mypy==1.4.1", "types-tabulate", "types-Deprecated", "types-PyYAML", "types-decorator", "types-protobuf", "types-python-dateutil", "types-requests", "types-setuptools", "types-six", "types-ujson", "pandas-stubs", "types-Pillow", "types-Pygments", "types-appdirs", "types-colorama", "types-google-cloud-ndb", "types-jsonschema", "types-psutil", "types-pywin32", "types-tqdm", "types-openpyxl"]
--- a/README.md
+++ b/README.md
@@ -299,7 +299,7 @@ pip install "openllm[mpt]"
 <tr>

 <td><a href=https://huggingface.co/docs/transformers/model_doc/opt>opt</a></td>
-<td><a href=https://huggingface.co/docs/transformers/main/model_doc/opt#transformers.MPTForCausalLM><code>MPTForCausalLM</code></a></td>
+<td><a href=https://huggingface.co/docs/transformers/main/model_doc/opt#transformers.OPTForCausalLM><code>OPTForCausalLM</code></a></td>
 <td>✅</td>
 <td>✅</td>
 <td>
--- a/changelog.d/133.feature.md
+++ b/changelog.d/133.feature.md
@@ -0,0 +1,14 @@
+APIs for LLMService are now provisional based on the capabilities of the LLM.
+
+The following APIs are considered provisional:
+
+- `/v1/embeddings`: This will be available if the LLM supports embeddings (i.e., ``LLM.embeddings`` is implemented; an example model is ``llama``).
+- `/hf/agent`: This will be available if the LLM supports running HF agents (i.e., ``LLM.generate_one`` is implemented; example models are ``starcoder`` and ``falcon``).
+- `POST /v1/adapters` and `GET /v1/adapters`: These will be available if the server is running with LoRA weights.
+
+``openllm.LLMRunner`` now includes three additional boolean attributes:
+- `runner.supports_embeddings`: Whether this runner supports embeddings.
+- `runner.supports_hf_agent`: Whether this runner supports HF agents.
+- `runner.has_adapters`: Whether this runner is loaded with LoRA adapters.
+
+Optimized ``openllm.models``'s bytecode performance
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -157,7 +157,7 @@ python_files = ["test_*.py", "*_test.py"]
 testpaths = ["tests"]

 [tool.black]
-exclude = '''
+extend-exclude = '''
 (
  /(
      \.eggs
@@ -174,14 +174,15 @@ exclude = '''
    | tools
  )/
  | src/openllm/__about__.py
+  | src/openllm/models
 )
 '''
 line-length = 119
 target-version = ["py38", "py39", "py310", "py311"]

 [tool.ruff]
-exclude = ["tools", "src/openllm/playground"]
-extend-include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
+extend-exclude = ["tools", "src/openllm/playground", "src/openllm/models", "src/openllm/_types.py"]
+extend-include = ["*.ipynb"]
 extend-select = [
    "B",    # flake8-bugbear
    "I",    # isort
@@ -223,12 +224,14 @@ ignore = [
    "TCH004",  # don't move runtime import out, just warn about it
    "RUF012",  # mutable attributes to be used with ClassVar
    "B905",    # zip warning about strict, only applicable for 3.10+
+    "D105",    # magic docstring
 ]
 line-length = 119
 target-version = "py312"
 unfixable = [
    "F401",   # Don't touch unused imports, just warn about it.
    "TCH004", # Don't touch import outside of TYPE_CHECKING block
+    "RUF100", # unused noqa, just warn about it
 ]
 [tool.ruff.flake8-type-checking]
 exempt-modules = ["typing", "typing_extensions", "."]
@@ -255,17 +258,9 @@ avoid-escape = false
 # Tests can use magic values, assertions, and relative imports
 "__init__.py" = ["E402", "F401", "F403", "F811"]
 "examples/**/*" = ["D"]
-"src/openllm/_llm.py" = ["B010", "B009"]
-"src/openllm/_strategies.py" = ["B904"]
-"src/openllm/_types.py" = ["E402"]
 "src/openllm/cli.py" = ["D301", "S101"]
-"src/openllm/models/**/*" = ["D106", "S101", "D104"]
-"src/openllm/playground/**/*" = ["E402", "F401", "PLR", "D"]
 "src/openllm/utils/dummy_*" = ["D107"]
-"src/openllm/utils/import_utils.py" = [
-    "PLW0603", # OK to ignore global access here
-    "D105",    # magic docstring
-]
+"src/openllm/utils/import_utils.py" = ["PLW0603"]
 "src/openllm_client/runtimes/*" = ["D107"]
 "tests/**/*" = [
    "S101",
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -28,6 +28,7 @@ from abc import abstractmethod
 from pathlib import Path

 import attr
+import inflection
 import orjson
 from huggingface_hub import hf_hub_download

@@ -82,6 +83,7 @@ if t.TYPE_CHECKING:
    from ._configuration import PeftType
    from ._types import AdaptersMapping
    from ._types import AdaptersTuple
+    from ._types import AnyCallable
    from ._types import DictStrAny
    from ._types import ListStr
    from ._types import LiteralRuntime
@@ -161,13 +163,12 @@ def make_tag(
            model_version = tag.version
            model_name = tag.name
        else:
-            if model_version is None:  # noqa: PLR5501
-                if not quiet:
-                    logger.warning(
-                        "Given 'model_id=%s' is a path, and 'model_version' is not passed. OpenLLM will generate the version based on the last modified time of this given directory.",
-                        model_id,
-                    )
-                model_version = generate_hash_from_file(model_id)
+            if not quiet and model_version is None:
+                logger.warning(
+                    "Given 'model_id=%s' is a path, and 'model_version' is not passed. OpenLLM will generate the version based on the last modified time of this given directory.",
+                    model_id,
+                )
+            model_version = first_not_none(model_version, default=generate_hash_from_file(model_id))
    else:
        config = t.cast(
            "transformers.PretrainedConfig",
@@ -418,6 +419,15 @@ class LLMInterface(ABC, t.Generic[M, T]):
    __llm_adapter_map__: dict[AdapterType, dict[str | t.Literal["default"], tuple[peft.PeftConfig, str]]] | None
    """A reference to the the cached LoRA adapter mapping."""

+    __llm_supports_embeddings__: bool
+    """A boolean indicating whether the model implements ``LLM.embeddings``."""
+    __llm_supports_generate__: bool
+    """A boolean indicating whether the model implements ``LLM.generate``."""
+    __llm_supports_generate_one__: bool
+    """A boolean indicating whether the model implements ``LLM.generate_one``."""
+    __llm_supports_generate_iterator__: bool
+    """A boolean indicating whether the model implements ``LLM.generate_iterator``."""
+
    if t.TYPE_CHECKING and not MYPY:

        def __attrs_init__(
@@ -528,6 +538,21 @@ def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]):
    return wrapper


+def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable:
+    # update docstring for given entrypoint
+    original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
+    original_fn.__doc__ = (
+        original_fn.__doc__
+        or f"""\
+    {cls.__name__}'s implementation for {fn}.
+
+    Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
+    The original model can then be accessed with 'self.model.get_base_model()'.
+    """
+    )
+    setattr(cls, fn, original_fn)
+
+
 def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
    attributes = {
        "import_model": _wrapped_import_model,
@@ -539,7 +564,11 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
    args: ListStr = []
    anns: DictStrAny = {}
    lines: ListStr = []
-    globs: DictStrAny = {"cls": cls, "_cached_LLMInterface_get": _object_getattribute.__get__(LLMInterface)}
+    globs: DictStrAny = {
+        "cls": cls,
+        "_cached_LLMInterface_get": _object_getattribute.__get__(LLMInterface),
+        "__gen_docstring": _update_docstring,
+    }
    # function initialisation
    for func, impl in attributes.items():
        impl_name = f"__wrapped_{func}"
@@ -561,9 +590,22 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
    interface_anns = codegen.get_annotations(LLMInterface)
    for v in {"bentomodel", "model", "tokenizer", "adapter_map"}:
        lines.append(_setattr_class(f"__llm_{v}__", None))
-        anns[f"__llm_{v}__"] = interface_anns.get("__llm_{v}__")
+        anns[f"__llm_{v}__"] = interface_anns.get(f"__llm_{v}__")

-    return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
+    # boolean to determine whether LLM has defined an implementation for a function
+    for fn in {"generate", "generate_one", "generate_iterator", "embeddings"}:
+        key = f"__llm_supports_{fn}__"
+        lines.extend(
+            [
+                _setattr_class(key, f"cls.{fn} is not _cached_LLMInterface_get('{fn}')"),
+                f"__gen_docstring(cls, '{fn}')",
+            ]
+        )
+        anns[key] = interface_anns.get(key)
+
+    return codegen.generate_function(
+        cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns
+    )


 _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"])
@@ -607,28 +649,24 @@ class LLM(LLMInterface[M, T], ReprMixin):
        implementation, config_class_name = cls._infer_implementation_from_name(cls.__name__)
        cls.__llm_implementation__ = implementation
        config_class = openllm.AutoConfig.infer_class_from_name(config_class_name)
-
        if "__openllm_internal__" in cd:
            if "config_class" not in cd:
                cls.config_class = config_class
        elif "config_class" not in cd:
            raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
-
        _make_assignment_script(cls)(cls)

-        # update docstring for given entrypoint
-        for fn in {"generate", "generate_one", "generate_iterator"}:
-            original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
-            original_fn.__doc__ = (
-                original_fn.__doc__
-                or f"""\
-            '{fn}' implementation {cls.__name__}.
-
-            Note that if LoRA is enabled (via either SDK or CLI), `self.model` will become a `peft.PeftModel`
-            The original can then be accessed with 'self.model.get_base_model()'.
-            """
-            )
-            setattr(cls, fn, original_fn)
+    def __getitem__(self, item: t.LiteralString | t.Any) -> t.Any:
+        if item is None:
+            raise TypeError(f"{self} doesn't understand how to index None.")
+        item = inflection.underscore(item)
+        internal_attributes = f"__llm_{item}__"
+        if hasattr(self, internal_attributes):
+            return getattr(self, internal_attributes)
+        elif hasattr(self, item):
+            return getattr(self, item)
+        else:
+            raise KeyError(item)

    @classmethod
    @overload
@@ -1667,6 +1705,9 @@ def llm_runner_class(self: openllm.LLM[M, T]) -> type[LLMRunner]:
                "__repr__": ReprMixin.__repr__,
                "__repr_keys__": property(_wrapped_repr_keys),
                "__repr_args__": _wrapped_repr_args,
+                "supports_embeddings": self["supports-embeddings"],
+                "supports_hf_agent": self["supports-generate-one"],
+                "has_adapters": self._adapters_mapping is not None,
            }
        ),
    )
--- a/src/openllm/_schema.py
+++ b/src/openllm/_schema.py
@@ -94,6 +94,8 @@ class MetadataOutput:
    model_name: str
    framework: str
    configuration: str
+    supports_embeddings: bool
+    supports_hf_agent: bool


@attr.frozen(slots=True)
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -89,48 +89,6 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
    return openllm.GenerationOutput(responses=responses, configuration=config)


-@svc.api(
-    input=bentoml.io.JSON.from_sample(sample=["Hey Jude, welcome to the jumgle!", "What is the meaning of life?"]),
-    output=bentoml.io.JSON.from_sample(
-        sample={
-            "embeddings": [
-                0.007917795330286026,
-                -0.014421648345887661,
-                0.00481307040899992,
-                0.007331526838243008,
-                -0.0066398633643984795,
-                0.00945580005645752,
-                0.0087016262114048,
-                -0.010709521360695362,
-                0.012635177001357079,
-                0.010541186667978764,
-                -0.00730888033285737,
-                -0.001783102168701589,
-                0.02339819073677063,
-                -0.010825827717781067,
-                -0.015888236463069916,
-                0.01876218430697918,
-                0.0076906150206923485,
-                0.0009032754460349679,
-                -0.010024012066423893,
-                0.01090280432254076,
-                -0.008668390102684498,
-                0.02070549875497818,
-                0.0014594447566196322,
-                -0.018775740638375282,
-                -0.014814382418990135,
-                0.01796768605709076,
-            ],
-            "num_tokens": 20,
-        }
-    ),
-    route="/v1/embeddings",
-)
-async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
-    responses = await runner.embeddings.async_run(phrases)
-    return openllm.EmbeddingsOutput(embeddings=responses["embeddings"].tolist()[0], num_tokens=responses["num_tokens"])
-
-
@svc.api(
    input=bentoml.io.Text(),
    output=bentoml.io.JSON.from_sample(
@@ -151,42 +109,96 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
        model_name=llm_config["model_name"],
        framework=llm_config["env"]["framework_value"],
        configuration=llm_config.model_dump_json().decode(),
+        supports_embeddings=runner.supports_embeddings,
+        supports_hf_agent=runner.supports_hf_agent,
    )


-@svc.api(
-    input=bentoml.io.Text.from_sample(sample="default"),
-    output=bentoml.io.JSON.from_sample(sample={"success": True, "error_msg": "some error message"}),
-    route="/v1/adapters",
-)
-async def adapters_v1(adapter_name: str) -> dict[str, bool | str]:
-    return await runner.set_adapter.async_run(adapter_name)
+if runner.supports_embeddings:
+
+    @svc.api(
+        input=bentoml.io.JSON.from_sample(sample=["Hey Jude, welcome to the jumgle!", "What is the meaning of life?"]),
+        output=bentoml.io.JSON.from_sample(
+            sample={
+                "embeddings": [
+                    0.007917795330286026,
+                    -0.014421648345887661,
+                    0.00481307040899992,
+                    0.007331526838243008,
+                    -0.0066398633643984795,
+                    0.00945580005645752,
+                    0.0087016262114048,
+                    -0.010709521360695362,
+                    0.012635177001357079,
+                    0.010541186667978764,
+                    -0.00730888033285737,
+                    -0.001783102168701589,
+                    0.02339819073677063,
+                    -0.010825827717781067,
+                    -0.015888236463069916,
+                    0.01876218430697918,
+                    0.0076906150206923485,
+                    0.0009032754460349679,
+                    -0.010024012066423893,
+                    0.01090280432254076,
+                    -0.008668390102684498,
+                    0.02070549875497818,
+                    0.0014594447566196322,
+                    -0.018775740638375282,
+                    -0.014814382418990135,
+                    0.01796768605709076,
+                ],
+                "num_tokens": 20,
+            }
+        ),
+        route="/v1/embeddings",
+    )
+    async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
+        responses = await runner.embeddings.async_run(phrases)
+        return openllm.EmbeddingsOutput(
+            embeddings=responses["embeddings"].tolist()[0], num_tokens=responses["num_tokens"]
+        )


-@attr.define
-class HfAgentInput:
-    inputs: str
-    parameters: t.Dict[str, t.Any]
+if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():

+    @attr.define
+    class HfAgentInput:
+        inputs: str
+        parameters: t.Dict[str, t.Any]

-async def hf_agent(request: Request) -> Response:
-    json_str = await request.body()
-    try:
-        input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), HfAgentInput)
-    except orjson.JSONDecodeError as err:
-        raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None
+    async def hf_agent(request: Request) -> Response:
+        json_str = await request.body()
+        try:
+            input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), HfAgentInput)
+        except orjson.JSONDecodeError as err:
+            raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None

-    stop = input_data.parameters.pop("stop", ["\n"])
-    try:
-        resp = await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters)
-        return JSONResponse(resp, status_code=200)
-    except NotImplementedError:
-        return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)
+        stop = input_data.parameters.pop("stop", ["\n"])
+        try:
+            resp = await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters)
+            return JSONResponse(resp, status_code=200)
+        except NotImplementedError:
+            return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)

+    hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])])

-hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])])
+    svc.mount_asgi_app(hf_app, path="/hf")

-svc.mount_asgi_app(hf_app, path="/hf")
+if runner.has_adapters:
+
+    @svc.api(
+        input=bentoml.io.Text.from_sample(sample="default"),
+        output=bentoml.io.JSON.from_sample(sample={"success": True, "error_msg": "some error message"}),
+        route="/v1/adapters",
+    )
+    async def adapters_v1(adapter_name: str) -> dict[str, bool | str]:
+        return await runner.set_adapter.async_run(adapter_name)
+
+else:
+
+    async def adapters_v1(_: Request) -> Response:
+        return JSONResponse({"success": False, "message": "No available adapters for current running server"})


 async def list_adapter_v1(_: Request) -> Response:
@@ -198,5 +210,8 @@ async def list_adapter_v1(_: Request) -> Response:
    return JSONResponse(res, status_code=200)


-metadata_app = Starlette(debug=True, routes=[Route("/adapters", list_adapter_v1, methods=["GET"])])
-svc.mount_asgi_app(metadata_app, path="/v1")
+adapters_routes_v1 = [Route("/adapters", list_adapter_v1, methods=["GET"])]
+if not runner.has_adapters:
+    adapters_routes_v1.append(Route("/adapters", adapters_v1, methods=["POST"]))
+adapters_app_v1 = Starlette(debug=True, routes=adapters_routes_v1)
+svc.mount_asgi_app(adapters_app_v1, path="/v1")
--- a/src/openllm/_types.py
+++ b/src/openllm/_types.py
@@ -145,6 +145,10 @@ class LLMRunner(bentoml.Runner):
    generate_one: RunnerMethod[LLMRunnable, [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
    generate_iterator: RunnerMethod[LLMRunnable, [str], t.Generator[t.Any, None, None]]

+    supports_embeddings: bool
+    supports_hf_agent: bool
+    has_adapters: bool
+
    def __init__(
        self,
        runnable_class: type[LLMRunnable],
--- a/src/openllm/cli.py
+++ b/src/openllm/cli.py
@@ -34,6 +34,7 @@ bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct'
 """
 from __future__ import annotations
 import functools
+import http.client
 import importlib.machinery
 import importlib.util
 import inspect
@@ -470,9 +471,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
        return super().get_command(ctx, cmd_name)

    def list_commands(self, ctx: click.Context) -> list[str]:
-        if ctx.command.name == "start" or ctx.command.name == "start-grpc":
+        if ctx.command.name in {"start", "start-grpc"}:
            return list(openllm.CONFIG_MAPPING.keys())
-
        return super().list_commands(ctx)

    @override
@@ -883,7 +883,7 @@ def prerequisite_check(

    requirements = llm_config["requirements"]
    if requirements is not None and len(requirements) > 0:
-        missing_requirements = [i for i in requirements if importlib.util.find_spec(i) is None]
+        missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
        if len(missing_requirements) > 0:
            _echo(
                f"Make sure to have the following dependencies available: {missing_requirements}",
@@ -2339,6 +2339,11 @@ def instruct(
    """
    client = openllm.client.HTTPClient(endpoint, timeout=timeout)

+    try:
+        client.call("metadata")
+    except http.client.BadStatusLine:
+        raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
+
    if agent == "hf":
        if not is_transformers_supports_agent():
            raise click.UsageError(
--- a/src/openllm/models/baichuan/init.py
+++ b/src/openllm/models/baichuan/init.py
@@ -11,41 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_cpm_kernels_available
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 try:
-    if not is_torch_available() or not is_cpm_kernels_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_baichuan"] = ["Baichuan"]
-
+    if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_baichuan"] = ["Baichuan"]
 if t.TYPE_CHECKING:
    from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_baichuan import START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING
    from .configuration_baichuan import BaichuanConfig as BaichuanConfig

    try:
-        if not is_torch_available() or not is_cpm_kernels_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_baichuan import Baichuan as Baichuan
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_baichuan import Baichuan as Baichuan
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/baichuan/configuration_baichuan.py
+++ b/src/openllm/models/baichuan/configuration_baichuan.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import annotations
-
 import openllm
-
-
 class BaichuanConfig(openllm.LLMConfig):
    """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.

@@ -26,7 +23,6 @@ class BaichuanConfig(openllm.LLMConfig):
    and English benchmarks (C-Eval, MMLU, etc).
    Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
    """
-
    __config__ = {
        "name_type": "lowercase",
        "trust_remote_code": True,
@@ -45,13 +41,10 @@ class BaichuanConfig(openllm.LLMConfig):
            "hiyouga/baichuan-7b-sft",
        ],
    }
-
    class GenerationConfig:
        max_new_tokens: int = 2048
        top_p: float = 0.7
        temperature: float = 0.95
-
-
 START_BAICHUAN_COMMAND_DOCSTRING = """\
 Run a LLMServer for Baichuan model.

@@ -71,5 +64,4 @@ or provide `--model-id` flag when running ``openllm start baichuan``:
 \b
 $ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b'
 """
-
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
--- a/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/src/openllm/models/baichuan/modeling_baichuan.py
@@ -13,69 +13,31 @@
 # limitations under the License.
 from __future__ import annotations
 import typing as t
-
 import openllm
-
 from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
 from ..._prompt import default_formatter
-
-
 if t.TYPE_CHECKING:
    import torch
-
    import transformers
 else:
    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
-
 class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
    __openllm_internal__ = True
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        top_p: float | None = None,
-        temperature: float | None = None,
-        use_default_prompt_template: bool = False,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
        # NOTE: The rest of attrs should be kwargs for GenerationConfig
-        generate_kwargs = {
-            "max_new_tokens": max_new_tokens,
-            "top_p": top_p,
-            "temperature": temperature,
-            **attrs,
-        }
-
+        generate_kwargs = {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}
        return prompt_text, generate_kwargs, {}
-
-    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
-        return generation_result[0]
-
+    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
-            outputs = self.model.generate(
-                **inputs,
-                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-            )
+            outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
            return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
--- a/src/openllm/models/chatglm/init.py
+++ b/src/openllm/models/chatglm/init.py
@@ -11,41 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_cpm_kernels_available
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 try:
-    if not is_torch_available() or not is_cpm_kernels_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_chatglm"] = ["ChatGLM"]
-
+    if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_chatglm"] = ["ChatGLM"]
 if t.TYPE_CHECKING:
    from .configuration_chatglm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_chatglm import START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING
    from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig
-
    try:
-        if not is_torch_available() or not is_cpm_kernels_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_chatglm import ChatGLM as ChatGLM
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_chatglm import ChatGLM as ChatGLM
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/chatglm/configuration_chatglm.py
+++ b/src/openllm/models/chatglm/configuration_chatglm.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import annotations
-
 import openllm
-
-
 class ChatGLMConfig(openllm.LLMConfig):
    """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.

@@ -30,7 +27,6 @@ class ChatGLMConfig(openllm.LLMConfig):

    Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
    """
-
    __config__ = {
        "name_type": "lowercase",
        "trust_remote_code": True,
@@ -48,22 +44,17 @@ class ChatGLMConfig(openllm.LLMConfig):
            "thudm/chatglm2-6b-int4",
        ],
    }
-
    retain_history: bool = openllm.LLMConfig.Field(
        False,
        description="""Whether to retain history given to the model.
        If set to True, then the model will retain given history.""",
    )
-
    use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
-
    class GenerationConfig:
        max_new_tokens: int = 2048
        num_beams: int = 1
        top_p: float = 0.7
        temperature: float = 0.95
-
-
 START_CHATGLM_COMMAND_DOCSTRING = """\
 Run a LLMServer for ChatGLM model.

@@ -83,5 +74,4 @@ or provide `--model-id` flag when running ``openllm start chatglm``:
 \b
 $ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
 """
-
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
--- a/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/src/openllm/models/chatglm/modeling_chatglm.py
@@ -13,94 +13,34 @@
 # limitations under the License.
 from __future__ import annotations
 import typing as t
-
-import bentoml
 import openllm
-
-from ...utils import generate_labels
-
-
 if t.TYPE_CHECKING:
    import torch
-
    import transformers
 else:
    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
-
 class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
    __openllm_internal__ = True
-
-    def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
-        _, tokenizer_attrs = self.llm_parameters
-
-        return bentoml.transformers.save_model(
-            self.tag,
-            transformers.AutoModel.from_pretrained(self.model_id, trust_remote_code=trust_remote_code),
-            labels=generate_labels(self),
-            custom_objects={
-                "tokenizer": transformers.AutoTokenizer.from_pretrained(
-                    self.model_id, trust_remote_code=trust_remote_code, **tokenizer_attrs
-                )
-            },
-        )
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        num_beams: int | None = None,
-        top_p: float | None = None,
-        temperature: float | None = None,
-        chat_history: list[str] | None = None,
-        use_default_prompt_template: bool = False,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[str] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        prompt_text = ""
-
        if use_default_prompt_template and chat_history is not None:
-            for i, (old_query, response) in enumerate(chat_history):
-                prompt_text += f"[Round {i}]\n问：{old_query}\n答：{response}\n"  # noqa: RUF001
+            for i, (old_query, response) in enumerate(chat_history): prompt_text += f"[Round {i}]\n问：{old_query}\n答：{response}\n"  # noqa: RUF001
            prompt_text += f"[Round {len(chat_history)}]\n问：{prompt}\n答："  # noqa: RUF001
-        else:
-            prompt_text = prompt
-
+        else: prompt_text = prompt
        postprocess_generate_kwargs = {"chat_history": chat_history if chat_history is not None else None}
-
        # NOTE: The rest of attrs should be kwargs for GenerationConfig
-        generate_kwargs = {
-            "max_new_tokens": max_new_tokens,
-            "num_beams": num_beams,
-            "top_p": top_p,
-            "temperature": temperature,
-            **attrs,
-        }
-
+        generate_kwargs = {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}
        return prompt_text, generate_kwargs, postprocess_generate_kwargs
-
-    def postprocess_generate(
-        self,
-        prompt: str,
-        generation_result: tuple[str, list[tuple[str, str]]],
-        *,
-        chat_history: list[tuple[str, str]] | None = None,
-        **attrs: t.Any,
-    ):
+    def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any):
        generated, history = generation_result
        if self.config.retain_history:
            assert chat_history is not None, "'retain_history' is True while there is no history provided."
            chat_history.extend(history)
        return generated
-
    def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]:
        with torch.inference_mode():
            self.model.eval()
            # Only use half precision if the model is not yet quantized
-            if self.config.use_half_precision:
-                self.model.half()
-            return self.model.chat(
-                self.tokenizer,
-                prompt,
-                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-            )
+            if self.config.use_half_precision: self.model.half()
+            return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
--- a/src/openllm/models/dolly_v2/__init__.py
+++ b/src/openllm/models/dolly_v2/__init__.py
@@ -11,40 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 try:
-    if not is_torch_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_dolly_v2"] = ["DollyV2"]
-
+    if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_dolly_v2"] = ["DollyV2"]
 if t.TYPE_CHECKING:
    from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_dolly_v2 import START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING
    from .configuration_dolly_v2 import DollyV2Config as DollyV2Config
-
    try:
-        if not is_torch_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_dolly_v2 import DollyV2 as DollyV2
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_torch_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_dolly_v2 import DollyV2 as DollyV2
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/dolly_v2/configuration_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/configuration_dolly_v2.py
@@ -13,14 +13,8 @@
 # limitations under the License.
 from __future__ import annotations
 import typing as t
-
 import openllm
-
-
-if t.TYPE_CHECKING:
-    from transformers import PreTrainedTokenizer
-
-
+if t.TYPE_CHECKING: import transformers
 class DollyV2Config(openllm.LLMConfig):
     """Databricks' Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.

@@ -33,7 +27,6 @@ class DollyV2Config(openllm.LLMConfig):

    Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
    """
-
    __config__ = {
        "timeout": 3600000,
        "url": "https://github.com/databrickslabs/dolly",
@@ -41,19 +34,15 @@ class DollyV2Config(openllm.LLMConfig):
        "default_id": "databricks/dolly-v2-3b",
        "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"],
    }
-
    return_full_text: bool = openllm.LLMConfig.Field(
        False, description="Whether to return the full prompt to the users."
    )
-
    class GenerationConfig:
        temperature: float = 0.9
        top_p: float = 0.92
        top_k: int = 5
        max_new_tokens: int = 256
        eos_token_id: int = 50277  # NOTE: from get_special_token_id(self.tokenizer, END_KEY)
-
-
 START_DOLLY_V2_COMMAND_DOCSTRING = """\
 Run a LLMServer for dolly-v2 model.

@@ -73,14 +62,10 @@ or provide `--model-id` flag when running ``openllm start dolly-v2``:
 \b
 $ openllm start dolly-v2 --model-id databricks/dolly-v2-7b
 """
-
 INSTRUCTION_KEY = "### Instruction:"
 RESPONSE_KEY = "### Response:"
 END_KEY = "### End"
-INTRO_BLURB = (
-    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
-)
-
+INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
 # NOTE: This is the prompt that is used for generating responses using an already
 # trained model.  It ends with the response key, where the job of the model is to provide
 # the completion that follows it (i.e. the response itself).
@@ -88,15 +73,8 @@ DEFAULT_PROMPT_TEMPLATE = """{intro}
 {instruction_key}
 {instruction}
 {response_key}
-""".format(
-    intro=INTRO_BLURB,
-    instruction_key=INSTRUCTION_KEY,
-    instruction="{instruction}",
-    response_key=RESPONSE_KEY,
-)
-
-
-def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
+""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
+def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) -> int:
    """Gets the token ID for a given string that has been added to the tokenizer as a special token.

    When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
@@ -113,6 +91,5 @@ def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
        int: the token ID for the given key.
    """
    token_ids = tokenizer.encode(key)
-    if len(token_ids) > 1:
-        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
+    if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]
--- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -15,288 +15,118 @@ from __future__ import annotations
 import logging
 import re
 import typing as t
-
 import openllm
-
 from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
 from .configuration_dolly_v2 import END_KEY
 from .configuration_dolly_v2 import RESPONSE_KEY
 from .configuration_dolly_v2 import get_special_token_id
-
-
 if t.TYPE_CHECKING:
-    import tensorflow as tf
    import torch
-
    import transformers
+    import tensorflow as tf
 else:
    tf = openllm.utils.LazyLoader("tf", globals(), "tensorflow")
    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
 logger = logging.getLogger(__name__)
-
-
 @t.overload
-def get_pipeline(
-    model: transformers.PreTrainedModel,
-    tokenizer: transformers.PreTrainedTokenizer,
-    _init: t.Literal[True] = True,
-    **attrs: t.Any,
-) -> transformers.Pipeline:
-    ...
-
-
+def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline: ...
 @t.overload
-def get_pipeline(
-    model: transformers.PreTrainedModel,
-    tokenizer: transformers.PreTrainedTokenizer,
-    _init: t.Literal[False] = ...,
-    **attrs: t.Any,
-) -> type[transformers.Pipeline]:
-    ...
-
-
-def get_pipeline(
-    model: transformers.PreTrainedModel,
-    tokenizer: transformers.PreTrainedTokenizer,
-    _init: bool = False,
-    **attrs: t.Any,
-) -> type[transformers.Pipeline] | transformers.Pipeline:
+def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]: ...
+def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
    class InstructionTextGenerationPipeline(transformers.Pipeline):
-        def __init__(
-            self,
-            *args: t.Any,
-            do_sample: bool = True,
-            max_new_tokens: int = 256,
-            top_p: float = 0.92,
-            top_k: int = 0,
-            **kwargs: t.Any,
-        ):
-            """Initialize the pipeline.
-
-            Args:
-                do_sample: Whether or not to use sampling. Defaults to True.
-                max_new_tokens: Max new tokens after the prompt to generate. Defaults to 128.
-                top_p: If set to float < 1, only the smallest set of most probable tokens with
-                       probabilities that add up to top_p or higher are kept for generation. Defaults to 0.92.
-                top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to 0.
-                *args: Additional positional arguments to be passed to ``transformers.Pipeline``.
-                **kwargs: Additional keyword arguments to be passed to ``transformers.Pipeline``.
-            """
-            super().__init__(
-                *args,
-                model=model,
-                tokenizer=tokenizer,
-                do_sample=do_sample,
-                max_new_tokens=max_new_tokens,
-                top_p=top_p,
-                top_k=top_k,
-                **kwargs,
-            )
-
+        def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any): super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)
        def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any):
-            if t.TYPE_CHECKING:
-                assert self.tokenizer is not None
+            if t.TYPE_CHECKING: assert self.tokenizer is not None
            preprocess_params: dict[str, t.Any] = {}
-
            # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
            # append a newline to yield a single token.  find whatever token is configured for the response key.
-            tokenizer_response_key = next(
-                (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
-            )
-
+            tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
            response_key_token_id = None
            end_key_token_id = None
            if tokenizer_response_key:
                try:
                    response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                    end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)
-
                    # Ensure generation stops once it generates "### End"
                    generate_kwargs["eos_token_id"] = end_key_token_id
-                except ValueError:
-                    pass
-
+                except ValueError: pass
            forward_params = generate_kwargs
            postprocess_params = {"response_key_token_id": response_key_token_id, "end_key_token_id": end_key_token_id}
-
-            if return_full_text is not None:
-                postprocess_params["return_full_text"] = return_full_text
-
+            if return_full_text is not None: postprocess_params["return_full_text"] = return_full_text
            return preprocess_params, forward_params, postprocess_params
-
        def preprocess(self, input_: str, **generate_kwargs: t.Any):
-            if t.TYPE_CHECKING:
-                assert self.tokenizer is not None
+            if t.TYPE_CHECKING: assert self.tokenizer is not None
            prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=input_)
            inputs = self.tokenizer(prompt_text, return_tensors="pt")
            inputs["prompt_text"] = prompt_text
            inputs["instruction_text"] = input_
            return inputs
-
        def _forward(self, model_inputs: dict[str, t.Any], **generate_kwargs: t.Any):
-            if t.TYPE_CHECKING:
-                assert self.tokenizer is not None
-            input_ids = model_inputs["input_ids"]
-            attention_mask = model_inputs.get("attention_mask", None)
-
-            if input_ids.shape[1] == 0:
-                input_ids = None
-                attention_mask = None
-                in_b = 1
-            else:
-                in_b = input_ids.shape[0]
-
-            generated_sequence = self.model.generate(
-                input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
-                attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
-                pad_token_id=self.tokenizer.pad_token_id,
-                **generate_kwargs,
-            )
-
+            if t.TYPE_CHECKING: assert self.tokenizer is not None
+            input_ids, attention_mask = model_inputs["input_ids"], model_inputs.get("attention_mask", None)
+            if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
+            else: in_b = input_ids.shape[0]
+            generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None, attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None, pad_token_id=self.tokenizer.pad_token_id, **generate_kwargs)
            out_b = generated_sequence.shape[0]
-            if self.framework == "pt":
-                generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
-            elif self.framework == "tf":
-                generated_sequence = tf.reshape(
-                    generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])
-                )
-
+            if self.framework == "pt": generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
+            elif self.framework == "tf": generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
            instruction_text = model_inputs.pop("instruction_text")
-            return {
-                "generated_sequence": generated_sequence,
-                "input_ids": input_ids,
-                "instruction_text": instruction_text,
-            }
-
-        def postprocess(
-            self,
-            model_outputs: dict[str, t.Any],
-            response_key_token_id: int,
-            end_key_token_id: int,
-            return_full_text: bool = False,
-        ):
-            if t.TYPE_CHECKING:
-                assert self.tokenizer is not None
-            generated_sequence = model_outputs["generated_sequence"][0]
-            instruction_text = model_outputs["instruction_text"]
+            return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

+        def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False):
+            if t.TYPE_CHECKING: assert self.tokenizer is not None
+            generated_sequence, instruction_text = model_outputs["generated_sequence"][0], model_outputs["instruction_text"]
            generated_sequence: list[list[int]] = generated_sequence.numpy().tolist()
            records: list[dict[t.Literal["generated_text"], str]] = []
            for sequence in generated_sequence:
                # The response will be set to this variable if we can identify it.
                decoded = None
-
                # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
                if response_key_token_id and end_key_token_id:
                    # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
                    # prompt, we should definitely find it.  We will return the tokens found after this token.
-                    try:
-                        response_pos = sequence.index(response_key_token_id)
-                    except ValueError:
-                        logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence)
-                        response_pos = None
-
+                    try: response_pos = sequence.index(response_key_token_id)
+                    except ValueError: response_pos = None
+                    if response_pos is None: logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence)
                    if response_pos:
                        # Next find where "### End" is located.  The model has been trained to end its responses with this
                        # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
                        # this token, as the response could be truncated.  If we don't find it then just return everything
                         # to the end.  Note that even though we set eos_token_id, we still see this token at the end.
-                        try:
-                            end_pos = sequence.index(end_key_token_id)
-                        except ValueError:
-                            end_pos = None
-
+                        try: end_pos = sequence.index(end_key_token_id)
+                        except ValueError: end_pos = None
                        decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()
-
                if not decoded:
                    # Otherwise we'll decode everything and use a regex to find the response and end.
-
                    fully_decoded = self.tokenizer.decode(sequence)
-
                    # The response appears after "### Response:".  The model has been trained to append "### End" at the
                    # end.
                    m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)
-
-                    if m:
-                        decoded = m.group(1).strip()
+                    if m: decoded = m.group(1).strip()
                    else:
                        # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
                        # return everything after "### Response:".
                        m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
-                        if m:
-                            decoded = m.group(1).strip()
-                        else:
-                            logger.warning("Failed to find response in:\n%s", fully_decoded)
-
+                        if m: decoded = m.group(1).strip()
+                        else: logger.warning("Failed to find response in:\n%s", fully_decoded)
                # If the full text is requested, then append the decoded text to the original instruction.
                # This technically isn't the full text, as we format the instruction in the prompt the model has been
                # trained on, but to the client it will appear to be the full text.
-                if return_full_text:
-                    decoded = f"{instruction_text}\n{decoded}"
-
+                if return_full_text: decoded = f"{instruction_text}\n{decoded}"
                rec = {"generated_text": decoded}
-
                records.append(rec)
-
            return records

-    if _init:
-        return InstructionTextGenerationPipeline()
+    if _init: return InstructionTextGenerationPipeline()
    return InstructionTextGenerationPipeline
-
-
 class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedTokenizer"]):
    __openllm_internal__ = True
-
    @property
-    def import_kwargs(self):
-        model_kwds = {
-            "device_map": "auto" if torch.cuda.is_available() else None,
-            "torch_dtype": torch.bfloat16,
-        }
-        tokenizer_kwds = {"padding_side": "left"}
-        return model_kwds, tokenizer_kwds
-
-    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
-        return get_pipeline(
-            model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
-            tokenizer=self.tokenizer,
-            _init=True,
-            return_full_text=self.config.return_full_text,
-        )
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        temperature: float | None = None,
-        top_k: int | None = None,
-        top_p: float | None = None,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
-        # NOTE: The rest of attrs should be kwargs for GenerationConfig
-        generate_kwargs = {
-            "max_new_tokens": max_new_tokens,
-            "top_k": top_k,
-            "top_p": top_p,
-            "temperature": temperature,
-            **attrs,
-        }
-
-        return prompt, generate_kwargs, {}
-
-    def postprocess_generate(
-        self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any
-    ) -> str:
-        return generation_result[0]["generated_text"]
-
+    def import_kwargs(self): return {"device_map": "auto" if torch.cuda.is_available() else None, "torch_dtype": torch.bfloat16}, {"padding_side": "left"}
+    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: return get_pipeline(model=transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), tokenizer=self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return prompt, {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {}
+    def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: return generation_result[0]["generated_text"]
    def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
-        with torch.inference_mode():
-            llm_config = self.config.model_construct_env(**attrs)
-            return self.model(
-                prompt,
-                return_full_text=llm_config.return_full_text,
-                generation_config=llm_config.to_generation_config(),
-            )
+        llm_config = self.config.model_construct_env(**attrs)
+        with torch.inference_mode(): return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
--- a/src/openllm/models/falcon/__init__.py
+++ b/src/openllm/models/falcon/__init__.py
@@ -11,40 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 try:
-    if not is_torch_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_falcon"] = ["Falcon"]
-
+    if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_falcon"] = ["Falcon"]
 if t.TYPE_CHECKING:
    from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_falcon import START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING
    from .configuration_falcon import FalconConfig as FalconConfig
-
    try:
-        if not is_torch_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_falcon import Falcon as Falcon
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_torch_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_falcon import Falcon as Falcon
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/falcon/configuration_falcon.py
+++ b/src/openllm/models/falcon/configuration_falcon.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import annotations
-
 import openllm
-
-
 class FalconConfig(openllm.LLMConfig):
    """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.

@@ -23,7 +20,6 @@ class FalconConfig(openllm.LLMConfig):

    Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
    """
-
    __config__ = {
        "name_type": "lowercase",
        "trust_remote_code": True,
@@ -50,15 +46,12 @@ class FalconConfig(openllm.LLMConfig):
            },
        ),
    }
-
    class GenerationConfig:
        max_new_tokens: int = 200
        top_k: int = 10
        num_return_sequences: int = 1
        num_beams: int = 4
        early_stopping: bool = True
-
-
 START_FALCON_COMMAND_DOCSTRING = """\
 Run a LLMServer for FalconLM model.

@@ -78,7 +71,6 @@ or provide `--model-id` flag when running ``openllm start falcon``:
 \b
 $ openllm start falcon --model-id tiiuae/falcon-7b-instruct
 """
-
 DEFAULT_PROMPT_TEMPLATE = """{context}
 {user_name}: {instruction}
 {agent}:
--- a/src/openllm/models/falcon/modeling_falcon.py
+++ b/src/openllm/models/falcon/modeling_falcon.py
@@ -11,105 +11,41 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
 import typing as t
-
 import openllm
-
 from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
 from ..._prompt import default_formatter
-
-
 if t.TYPE_CHECKING:
    import torch
-
    import transformers
 else:
    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
-
 class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
    __openllm_internal__ = True
-
    @property
-    def import_kwargs(self):
-        model_kwds = {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() else None}
-        tokenizer_kwds: dict[str, t.Any] = {}
-        return model_kwds, tokenizer_kwds
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        top_k: int | None = None,
-        num_return_sequences: int | None = None,
-        eos_token_id: int | None = None,
-        use_default_prompt_template: bool = False,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() else None}, {}
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument instead of "
-                    "kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "top_k": top_k,
-            "num_return_sequences": num_return_sequences,
-            "eos_token_id": eos_token_id,
-            **attrs,
-        }
-
-        return prompt_text, generation_config, {}
-
-    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
-        return generation_result[0]
-
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        return prompt_text, {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}
+    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        eos_token_id = attrs.pop("eos_token_id", self.tokenizer.eos_token_id)
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
-            outputs = self.model.generate(
-                input_ids=inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
-                generation_config=self.config.model_construct_env(
-                    eos_token_id=eos_token_id, **attrs
-                ).to_generation_config(),
-            )
-            return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-    def generate_one(
-        self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any
-    ) -> list[dict[t.Literal["generated_text"], str]]:
+        eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device)
+        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], generation_config=self.config.model_construct_env( eos_token_id=eos_token_id, **attrs).to_generation_config()), skip_special_tokens=True)
+    def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
        from ..._generation import StopSequenceCriteria
-
-        max_new_tokens = preprocess_generate_kwds.pop("max_new_tokens", 200)
-        encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-        src_len = encoded_inputs["input_ids"].shape[1]
-        stopping_criteria = preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
+        max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
+        src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
        stopping_criteria.append(StopSequenceCriteria(stop, self.tokenizer))
-        outputs = self.model.generate(
-            encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria
-        )
-
-        result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
+        result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
        # Inference API returns the stop sequence
        for stop_seq in stop:
-            if result.endswith(stop_seq):
-                result = result[: -len(stop_seq)]
+            if result.endswith(stop_seq): result = result[: -len(stop_seq)]
        return [{"generated_text": result}]
--- a/src/openllm/models/flan_t5/__init__.py
+++ b/src/openllm/models/flan_t5/__init__.py
@@ -13,73 +13,40 @@
 # limitations under the License.

 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_flax_available
 from ...utils import is_tf_available
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_flan_t5": ["FlanT5Config", "START_FLAN_T5_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 try:
-    if not is_torch_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_flan_t5"] = ["FlanT5"]
-
+    if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_flan_t5"] = ["FlanT5"]
 try:
-    if not is_flax_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
-
+    if not is_flax_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
 try:
-    if not is_tf_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
-
-
+    if not is_tf_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
 if t.TYPE_CHECKING:
    from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_flan_t5 import START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
    from .configuration_flan_t5 import FlanT5Config as FlanT5Config
-
    try:
-        if not is_torch_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_flan_t5 import FlanT5 as FlanT5
-
+        if not is_torch_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_flan_t5 import FlanT5 as FlanT5
    try:
-        if not is_flax_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
-
+        if not is_flax_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
    try:
-        if not is_tf_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_tf_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/flan_t5/configuration_flan_t5.py
+++ b/src/openllm/models/flan_t5/configuration_flan_t5.py
@@ -12,10 +12,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import annotations
-
 import openllm
+class FlanT5Config(openllm.LLMConfig):
+    """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).

+    It is an enhanced version of T5 that has been finetuned in a mixture of tasks.

+    Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
+    """
+    __config__ = {
+        "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
+        "default_id": "google/flan-t5-large",
+        "architecture": "T5ForConditionalGeneration",
+        "model_ids": [
+            "google/flan-t5-small",
+            "google/flan-t5-base",
+            "google/flan-t5-large",
+            "google/flan-t5-xl",
+            "google/flan-t5-xxl",
+        ],
+        "model_type": "seq2seq_lm",
+    }
+    class GenerationConfig:
+        temperature: float = 0.9
+        max_new_tokens: int = 2048
+        top_k: int = 50
+        top_p: float = 0.4
+        repetition_penalty = 1.0
 START_FLAN_T5_COMMAND_DOCSTRING = """\
 Run a LLMServer for FLAN-T5 model.

@@ -41,35 +64,4 @@ or provide `--model-id` flag when running ``openllm start flan-t5``:
 \b
 $ openllm start flan-t5 --model-id google/flan-t5-xxl
 """
-
 DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:"""
-
-
-class FlanT5Config(openllm.LLMConfig):
-    """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
-
-    It is an enhanced version of T5 that has been finetuned in a mixture of tasks.
-
-    Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
-    """
-
-    __config__ = {
-        "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
-        "default_id": "google/flan-t5-large",
-        "architecture": "T5ForConditionalGeneration",
-        "model_ids": [
-            "google/flan-t5-small",
-            "google/flan-t5-base",
-            "google/flan-t5-large",
-            "google/flan-t5-xl",
-            "google/flan-t5-xxl",
-        ],
-        "model_type": "seq2seq_lm",
-    }
-
-    class GenerationConfig:
-        temperature: float = 0.9
-        max_new_tokens: int = 2048
-        top_k: int = 50
-        top_p: float = 0.4
-        repetition_penalty = 1.0
--- a/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -13,71 +13,25 @@
 # limitations under the License.
 from __future__ import annotations
 import typing as t
-
 import openllm
-
 from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
 from ..._prompt import default_formatter
-
-
 if t.TYPE_CHECKING:
    import torch
-
    import transformers  # noqa: F401
 else:
    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-
-
 class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
    __openllm_internal__ = True
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        temperature: float | None = None,
-        top_k: int | None = None,
-        top_p: float | None = None,
-        repetition_penalty: float | None = None,
-        use_default_prompt_template: bool = True,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_k": top_k,
-            "top_p": top_p,
-            "repetition_penalty": repetition_penalty,
-        }
-        return prompt_text, generation_config, {}
-
-    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
-        return generation_result[0]
-
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
+    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        with torch.inference_mode():
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
-            result_tensor = self.model.generate(
-                input_ids,
-                do_sample=True,
-                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-            )
-            return self.tokenizer.batch_decode(result_tensor, skip_special_tokens=True)
+        with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
--- a/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -26,64 +26,18 @@ if t.TYPE_CHECKING:

 class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
    __openllm_internal__ = True
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        temperature: float | None = None,
-        top_k: int | None = None,
-        top_p: float | None = None,
-        repetition_penalty: float | None = None,
-        decoder_start_token_id: int | None = None,
-        use_default_prompt_template: bool = True,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, decoder_start_token_id: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        if decoder_start_token_id is None:
-            decoder_start_token_id = 0
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_k": top_k,
-            "top_p": top_p,
-            "repetition_penalty": repetition_penalty,
-            "decoder_start_token_id": decoder_start_token_id,
-        }
-        return prompt_text, generation_config, {}
-
-    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
-        return generation_result[0]
-
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        if decoder_start_token_id is None: decoder_start_token_id = 0
+        return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty, "decoder_start_token_id": decoder_start_token_id}, {}
+    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        # XXX: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main
-        # as it is required for encoder-decoder generation.
+        # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
        decoder_start_token_id = attrs.pop("decoder_start_token_id", 0)
-        input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
-        result_tensor = self.model.generate(
-            input_ids,
-            do_sample=True,
-            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-            decoder_start_token_id=decoder_start_token_id,
-        )
-        return self.tokenizer.batch_decode(
-            result_tensor.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
-        )
+        return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="np")["input_ids"], do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), decoder_start_token_id=decoder_start_token_id).sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True)
--- a/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
+++ b/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
@@ -13,66 +13,20 @@
 # limitations under the License.
 from __future__ import annotations
 import typing as t
-
 import openllm
-
 from .configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
 from ..._prompt import default_formatter
-
-
-if t.TYPE_CHECKING:
-    import transformers  # noqa: F401
-
-
+if t.TYPE_CHECKING: import transformers  # noqa: F401
 class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
    __openllm_internal__ = True
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        temperature: float | None = None,
-        top_k: int | None = None,
-        top_p: float | None = None,
-        repetition_penalty: float | None = None,
-        use_default_prompt_template: bool = True,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_k": top_k,
-            "top_p": top_p,
-            "repetition_penalty": repetition_penalty,
-        }
-        return prompt_text, generation_config, {}
-
-    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
-        return generation_result[0]
-
-    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        input_ids = self.tokenizer(prompt, return_tensors="tf").input_ids
-        outputs = self.model.generate(
-            input_ids,
-            do_sample=True,
-            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-        )
-        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {}
+    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
+    def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
--- a/src/openllm/models/gpt_neox/__init__.py
+++ b/src/openllm/models/gpt_neox/__init__.py
@@ -11,40 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_gpt_neox": ["GPTNeoXConfig", "START_GPT_NEOX_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 try:
-    if not is_torch_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
-
+    if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
 if t.TYPE_CHECKING:
    from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_gpt_neox import START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING
    from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig
-
    try:
-        if not is_torch_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_gpt_neox import GPTNeoX as GPTNeoX
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_torch_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_gpt_neox import GPTNeoX as GPTNeoX
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/gpt_neox/configuration_gpt_neox.py
+++ b/src/openllm/models/gpt_neox/configuration_gpt_neox.py
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
-
 import openllm
-
-
 class GPTNeoXConfig(openllm.LLMConfig):
    """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license.

@@ -32,7 +28,6 @@ class GPTNeoXConfig(openllm.LLMConfig):
    Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
    for more information.
    """
-
    __config__ = {
        "model_name": "gpt_neox",
        "start_name": "gpt-neox",
@@ -42,14 +37,10 @@ class GPTNeoXConfig(openllm.LLMConfig):
        "default_id": "eleutherai/gpt-neox-20b",
        "model_ids": ["eleutherai/gpt-neox-20b"],
    }
-
    use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
-
    class GenerationConfig:
        temperature: float = 0.9
        max_new_tokens: int = 100
-
-
 START_GPT_NEOX_COMMAND_DOCSTRING = """\
 Run a LLMServer for GPTNeoX model.

@@ -69,6 +60,4 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``:
 \b
 $ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b'
 """
-
-
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
--- a/src/openllm/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/openllm/models/gpt_neox/modeling_gpt_neox.py
@@ -11,88 +11,34 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
 import logging
 import typing as t
-
 import openllm
-
 from .configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE
 from ..._prompt import default_formatter
-
-
-if t.TYPE_CHECKING:
-    import torch
-
-    import transformers
-else:
-    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-
-
+if t.TYPE_CHECKING: import torch, transformers
+else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
 logger = logging.getLogger(__name__)
-
-
 class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
    __openllm_internal__ = True
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        temperature: float | None = None,
-        max_new_tokens: int | None = None,
-        use_default_prompt_template: bool = True,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        generation_config = {"max_new_tokens": max_new_tokens, "temperature": temperature}
-
-        return prompt_text, generation_config, {}
-
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature}, {}
    @property
-    def import_kwargs(self):
-        model_kwds = {"device_map": "auto" if torch.cuda.device_count() > 1 else None}
-        tokenizer_kwds: dict[str, t.Any] = {}
-        return model_kwds, tokenizer_kwds
-
-    def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
-        return generation_result[0]
-
+    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.device_count() > 1 else None}, {}
+    def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
-        if self.config.use_half_precision:
-            model.half()
+        if self.config.use_half_precision: model.half()
        return model
-
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
        from ..._generation import StopOnTokens
-
-        generation_kwargs = {
-            "do_sample": True,
-            "generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
-            "pad_token_id": self.tokenizer.eos_token_id,
-            "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
-        }
-
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-        with torch.inference_mode():
-            gen_tokens = self.model.generate(inputs.input_ids, **generation_kwargs)
-            return self.tokenizer.batch_decode(gen_tokens)
+        generation_kwargs = {"do_sample": True, "generation_config": self.config.model_construct_env(**attrs).to_generation_config(), "pad_token_id": self.tokenizer.eos_token_id, "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()])}
+        with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, **generation_kwargs))
--- a/src/openllm/models/llama/__init__.py
+++ b/src/openllm/models/llama/__init__.py
@@ -11,64 +11,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_torch_available
 from ...utils import is_vllm_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_llama": [
-        "LlaMAConfig",
-        "START_LLAMA_COMMAND_DOCSTRING",
-        "DEFAULT_PROMPT_TEMPLATE",
-        "PROMPT_MAPPING",
-    ],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_llama": ["LlaMAConfig", "START_LLAMA_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
 try:
-    if not is_vllm_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_vllm_llama"] = ["VLLMLlaMA"]
-
+    if not is_vllm_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_vllm_llama"] = ["VLLMLlaMA"]
 try:
-    if not is_torch_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_llama"] = ["LlaMA"]
-
-
+    if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_llama"] = ["LlaMA"]
 if t.TYPE_CHECKING:
    from .configuration_llama import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_llama import PROMPT_MAPPING as PROMPT_MAPPING
    from .configuration_llama import START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING
    from .configuration_llama import LlaMAConfig as LlaMAConfig
-
    try:
-        if not is_vllm_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_vllm_llama import VLLMLlaMA as VLLMLlaMA
-
+        if not is_vllm_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_vllm_llama import VLLMLlaMA as VLLMLlaMA
    try:
-        if not is_torch_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_llama import LlaMA as LlaMA
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_torch_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_llama import LlaMA as LlaMA
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/llama/configuration_llama.py
+++ b/src/openllm/models/llama/configuration_llama.py
@@ -11,13 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
 import typing as t
-
 import openllm
-
-
 class LlaMAConfig(openllm.LLMConfig):
    """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.

@@ -30,11 +26,7 @@ class LlaMAConfig(openllm.LLMConfig):
    Refer to [LlaMA's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
    for more information.
    """
-
-    use_llama2_prompt: bool = openllm.LLMConfig.Field(
-        True, description="Whether to use the prompt format for LlaMA 2. Disable this when working with LlaMA 1."
-    )
-
+    use_llama2_prompt: bool = openllm.LLMConfig.Field(True, description="Whether to use the prompt format for LlaMA 2. Disable this when working with LlaMA 1.")
    __config__ = {
        "model_name": "llama",
        "start_name": "llama",
@@ -69,18 +61,14 @@ class LlaMAConfig(openllm.LLMConfig):
            },
        ),
    }
-
    class GenerationConfig:
        max_new_tokens: int = 256
        temperature: float = 0.45
        top_p: float = 0.95
        top_k: int = 12
-
    class SamplingParams:
        best_of: int = 1
        presence_penalty: float = 0.5
-
-
 START_LLAMA_COMMAND_DOCSTRING = """\
 Run a LLMServer for LlaMA model.

@@ -110,39 +98,14 @@ OpenLLM also supports running LlaMA-2 and its fine-tune and variants. To import
 \b
 $ CONVERTER=hf-llama2 openllm import llama /path/to/llama-2
 """
-
 SYSTEM_MESSAGE = """
 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

 If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
 """
-
-SINST_KEY = "[INST]"
-EINST_KEY = "[/INST]"
-SYS_KEY = "<<SYS>>"
-EOS_TOKEN = "</s>"
-BOS_TOKEN = "<s>"
-
-# TODO: support history
-_v2_prompt = """{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} """.format(
-    start_key=SINST_KEY,
-    sys_key=SYS_KEY,
-    system_message=SYSTEM_MESSAGE,
-    instruction="{instruction}",
-    end_key=EINST_KEY,
-)
-
-# XXX: implement me
-_v1_prompt = """{instruction}"""
-
-PROMPT_MAPPING = {
-    "v1": _v1_prompt,
-    "v2": _v2_prompt,
-}
-
-
-def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str:
-    return PROMPT_MAPPING[model_type]
-
-
+SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = "[INST]", "[/INST]", "<<SYS>>", "</s>", "<s>"
+# TODO: support history and v1 prompt implementation
+_v1_prompt, _v2_prompt = """{instruction}""", """{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} """.format(start_key=SINST_KEY, sys_key=SYS_KEY, system_message=SYSTEM_MESSAGE, instruction="{instruction}", end_key=EINST_KEY)
+PROMPT_MAPPING = {"v1": _v1_prompt, "v2": _v2_prompt}
+def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str: return PROMPT_MAPPING[model_type]
 DEFAULT_PROMPT_TEMPLATE = _get_prompt
--- a/src/openllm/models/llama/modeling_llama.py
+++ b/src/openllm/models/llama/modeling_llama.py
@@ -11,110 +11,41 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
 import logging
 import typing as t
-
 import openllm
-
 from .configuration_llama import DEFAULT_PROMPT_TEMPLATE
 from ..._llm import LLMEmbeddings
 from ..._prompt import default_formatter
-
-
-if t.TYPE_CHECKING:
-    import torch
-    import torch.nn.functional as F
-
-    import transformers
-else:
-    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-    F = openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
-
-
+if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F
+else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
 logger = logging.getLogger(__name__)
-
-
 class LlaMA(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
    __openllm_internal__ = True
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        top_k: int | None = None,
-        top_p: float | None = None,
-        temperature: float | None = None,
-        max_new_tokens: int | None = None,
-        use_default_prompt_template: bool = True,
-        use_llama2_prompt: bool = True,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            _PROMPT = DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1")
            template_variables = default_formatter.extract_template_variables(_PROMPT)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-        }
-
-        return prompt_text, generation_config, {}
-
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {}
    @property
-    def import_kwargs(self):
-        model_kwds = {"device_map": "auto" if torch.cuda.device_count() > 1 else None}
-        tokenizer_kwds: dict[str, t.Any] = {}
-        return model_kwds, tokenizer_kwds
-
-    def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
-        return generation_result[0]
-
+    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.device_count() > 1 else None}, {}
+    def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
        from ..._generation import StopOnTokens
-
-        generation_kwargs = {
-            "generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
-            "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
-        }
-
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-        with torch.inference_mode():
-            gen_tokens = self.model.generate(**inputs, **generation_kwargs)
-            return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-
+        generation_kwargs = {"generation_config": self.config.model_construct_env(**attrs).to_generation_config(), "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()])}
+        with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), **generation_kwargs), skip_special_tokens=True, clean_up_tokenization_spaces=True)
    def embeddings(self, prompts: list[str]) -> LLMEmbeddings:
        encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device)
-        input_ids = encoding["input_ids"]
-        attention_mask = encoding["attention_mask"]
+        input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
        with torch.inference_mode():
-            model_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
-            data = model_outputs.hidden_states[-1]
+            data = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
            mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
            masked_embeddings = data * mask
-            sum_embeddings = torch.sum(masked_embeddings, dim=1)
-            seq_length = torch.sum(mask, dim=1)
-            embedding = sum_embeddings / seq_length
-            normalized_embeddings = F.normalize(embedding, p=2, dim=1)
-        return {
-            "embeddings": normalized_embeddings,
-            "num_tokens": torch.sum(attention_mask).item(),
-        }
+            sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
+        return {"embeddings": F.normalize(sum_embeddings / seq_length, p=2, dim=1), "num_tokens": torch.sum(attention_mask).item()}
--- a/src/openllm/models/mpt/__init__.py
+++ b/src/openllm/models/mpt/__init__.py
@@ -11,42 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_mpt": ["MPTConfig", "START_MPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE", "PROMPT_MAPPING"]}
 try:
-    if not is_torch_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_mpt"] = ["MPT"]
-
-
+    if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_mpt"] = ["MPT"]
 if t.TYPE_CHECKING:
    from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_mpt import PROMPT_MAPPING as PROMPT_MAPPING
    from .configuration_mpt import START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING
    from .configuration_mpt import MPTConfig as MPTConfig
-
    try:
-        if not is_torch_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_mpt import MPT as MPT
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_torch_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_mpt import MPT as MPT
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/mpt/configuration_mpt.py
+++ b/src/openllm/models/mpt/configuration_mpt.py
@@ -11,20 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
 import typing as t
-
 import openllm
-
-
-if t.TYPE_CHECKING:
-    MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
-else:
-    # TODO: Support Literal string for LLMConfig
-    MPTPromptType = str
-
-
+if t.TYPE_CHECKING: MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
+else: MPTPromptType = str
 class MPTConfig(openllm.LLMConfig):
    """MPT is a decoder-style transformer pretrained from scratch on English text and code.

@@ -34,7 +25,6 @@ class MPTConfig(openllm.LLMConfig):
    on HuggingFace. Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
    for more details on specific models.
    """
-
    __config__ = {
        "name_type": "lowercase",
        "trust_remote_code": True,
@@ -53,27 +43,12 @@ class MPTConfig(openllm.LLMConfig):
            "mosaicml/mpt-30b-chat",
        ],
    }
-
-    prompt_type: MPTPromptType = openllm.LLMConfig.Field(
-        '"default"',
-        description="""Given prompt type for running MPT. Default will be inferred from model name if pretrained.""",
-    )
-
-    max_sequence_length: int = openllm.LLMConfig.Field(
-        2048,
-        description="""\
-    Max sequence length to run MPT with. Note that MPT is trained ith sequence length
-    of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096
-    (for 7b models) and 16384 (for 30b models)
-    """,
-    )
-
+    prompt_type: MPTPromptType = openllm.LLMConfig.Field('"default"', description="""Given prompt type for running MPT. Default will be inferred from model name if pretrained.""")
+    max_sequence_length: int = openllm.LLMConfig.Field(2048, description="Max sequence length to run MPT with. Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)")
    class GenerationConfig:
        max_new_tokens: int = 128
        temperature: float = 0
        top_p: float = 0.8
-
-
 START_MPT_COMMAND_DOCSTRING = """\
 Run a LLMServer for MPT model.

@@ -100,43 +75,16 @@ or provide `--model-id` flag when running ``openllm start mpt``:
 \b
 $ openllm start mpt --model-id mosaicml/mpt-30b
 """
-
-INSTRUCTION_KEY = "### Instruction:"
-RESPONSE_KEY = "### Response:"
-END_KEY = "### End"
-INTRO_BLURB = (
-    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
-)
-
+INSTRUCTION_KEY, RESPONSE_KEY, END_KEY = "### Instruction:", "### Response:", "### End"
+INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
 # NOTE: This is the prompt that is used for generating responses using an already
 # trained model.  It ends with the response key, where the job of the model is to provide
 # the completion that follows it (i.e. the response itself).
-_instruct_prompt = """{intro}
+_chat_prompt, _default_prompt, _instruct_prompt = """{instruction}""", """{instruction}""", """{intro}
 {instruction_key}
 {instruction}
 {response_key}
-""".format(
-    intro=INTRO_BLURB,
-    instruction_key=INSTRUCTION_KEY,
-    instruction="{instruction}",
-    response_key=RESPONSE_KEY,
-)
-
-_default_prompt = """{instruction}"""
-
-# TODO: XXX implement me
-_chat_prompt = """{instruction}"""
-
-PROMPT_MAPPING = {
-    "default": _default_prompt,
-    "instruct": _instruct_prompt,
-    "storywriter": _default_prompt,
-    "chat": _chat_prompt,
-}
-
-
-def _get_prompt(model_type: str) -> str:
-    return PROMPT_MAPPING[model_type]
-
-
+""".format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY)
+PROMPT_MAPPING = {"default": _default_prompt, "instruct": _instruct_prompt, "storywriter": _default_prompt, "chat": _chat_prompt}
+def _get_prompt(model_type: str) -> str: return PROMPT_MAPPING[model_type]
 DEFAULT_PROMPT_TEMPLATE = _get_prompt
--- a/src/openllm/models/mpt/modeling_mpt.py
+++ b/src/openllm/models/mpt/modeling_mpt.py
@@ -15,189 +15,70 @@
 from __future__ import annotations
 import logging
 import typing as t
-
 import bentoml
 import openllm
-
-from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE
+from .configuration_mpt import DEFAULT_PROMPT_TEMPLATE, MPTPromptType
 from ..._prompt import default_formatter
-from ...utils import generate_labels
-from ...utils import is_triton_available
-
-
-if t.TYPE_CHECKING:
-    import torch
-
-    import transformers
-
-    from .configuration_mpt import MPTPromptType
-else:
-    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
+from ...utils import generate_labels, is_triton_available
+if t.TYPE_CHECKING: import transformers, torch
+else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
 logger = logging.getLogger(__name__)
-
-
-def get_mpt_config(
-    model_id_or_path: str,
-    max_sequence_length: int,
-    device: torch.device | str | int | None,
-    device_map: str | None = None,
-    trust_remote_code: bool = True,
-) -> transformers.PretrainedConfig:
+def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True) -> transformers.PretrainedConfig:
    config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
-    if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)):
-        config.init_device = str(device)
-    if hasattr(config, "attn_config") and is_triton_available():
-        config.attn_config["attn_impl"] = "triton"
-    else:
-        logger.debug(
-            "'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'"
-        )
+    if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
+    if hasattr(config, "attn_config") and is_triton_available(): config.attn_config["attn_impl"] = "triton"
+    else: logger.debug("'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'")
    # setting max_seq_len
    config.max_seq_len = max_sequence_length
    return config
-
-
 class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXTokenizerFast"]):
    __openllm_internal__ = True
-
-    def llm_post_init(self):
-        self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
+    def llm_post_init(self): self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    @property
-    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
-        model_kwds = {"torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}
-        tokenizer_kwds = {"padding_side": "left"}
-        return model_kwds, tokenizer_kwds
-
+    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left"}
    def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
        _, tokenizer_attrs = self.llm_parameters
-
        torch_dtype = attrs.pop("torch_dtype", self.dtype)
        device_map = attrs.pop("device_map", None)
        attrs.pop("low_cpu_mem_usage", None)
-
-        config = get_mpt_config(
-            self.model_id,
-            self.config.max_sequence_length,
-            self.device,
-            device_map=device_map,
-            trust_remote_code=trust_remote_code,
-        )
-
+        config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
-        if tokenizer.pad_token_id is None:
-            logger.warning("pad_token_id is not set. Setting it to eos_token")
-            tokenizer.pad_token = tokenizer.eos_token
-
-        model = transformers.AutoModelForCausalLM.from_pretrained(
-            self.model_id,
-            config=config,
-            torch_dtype=torch_dtype,
-            trust_remote_code=trust_remote_code,
-            device_map=device_map,
-            **attrs,
-        )
-        try:
-            return bentoml.transformers.save_model(
-                self.tag,
-                model,
-                custom_objects={"tokenizer": tokenizer},
-                labels=generate_labels(self),
-            )
-        finally:
-            torch.cuda.empty_cache()
-
+        if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
+        model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
+        try: return bentoml.transformers.save_model( self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+        finally: torch.cuda.empty_cache()
    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
        torch_dtype = attrs.pop("torch_dtype", self.dtype)
        device_map = attrs.pop("device_map", None)
        trust_remote_code = attrs.pop("trust_remote_code", True)
-
-        _ref = bentoml.transformers.get(self.tag)
-        config = get_mpt_config(
-            _ref.path,
-            self.config.max_sequence_length,
-            self.device,
-            device_map=device_map,
-            trust_remote_code=trust_remote_code,
-        )
-        model = transformers.AutoModelForCausalLM.from_pretrained(
-            _ref.path,
-            config=config,
-            trust_remote_code=trust_remote_code,
-            torch_dtype=torch_dtype,
-            device_map=device_map,
-            **attrs,
-        )
+        config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
+        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs)
        model.tie_weights()
        return model
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        temperature: float | None = None,
-        top_p: float | None = None,
-        prompt_type: MPTPromptType | None = None,
-        use_default_prompt_template: bool = True,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def sanitize_parameters( self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            if prompt_type is None:
-                if "instruct" in self.model_id:
-                    prompt_type = "instruct"
-                elif "storywriter" in self.model_id:
-                    prompt_type = "storywriter"
-                elif "chat" in self.model_id:
-                    prompt_type = "chat"
-                else:
-                    prompt_type = "default"
+                if "instruct" in self.model_id: prompt_type = "instruct"
+                elif "storywriter" in self.model_id: prompt_type = "storywriter"
+                elif "chat" in self.model_id: prompt_type = "chat"
+                else: prompt_type = "default"
            _PROMPT = DEFAULT_PROMPT_TEMPLATE(prompt_type)
            template_variables = default_formatter.extract_template_variables(_PROMPT)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_p": top_p,
-        }
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = _PROMPT.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        generation_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
        return prompt_text, generation_config, {}
-
-    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
-        return generation_result[0]
-
+    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: return generation_result[0]
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
        llm_config = self.config.model_construct_env(**attrs)
-
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-
-        attrs = {
-            "do_sample": False if llm_config["temperature"] == 0 else True,
-            "eos_token_id": self.tokenizer.eos_token_id,
-            "pad_token_id": self.tokenizer.pad_token_id,
-            "generation_config": llm_config.to_generation_config(),
-        }
-
+        attrs = {"do_sample": False if llm_config["temperature"] == 0 else True, "eos_token_id": self.tokenizer.eos_token_id, "pad_token_id": self.tokenizer.pad_token_id, "generation_config": llm_config.to_generation_config()}
        with torch.inference_mode():
            if torch.cuda.is_available():
                with torch.autocast("cuda", torch.float16):
                    generated_tensors = self.model.generate(**inputs, **attrs)
-            else:
-                generated_tensors = self.model.generate(**inputs, **attrs)
-
+            else: generated_tensors = self.model.generate(**inputs, **attrs)
        return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
--- a/src/openllm/models/opt/__init__.py
+++ b/src/openllm/models/opt/__init__.py
@@ -11,75 +11,41 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_flax_available
 from ...utils import is_tf_available
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_opt": ["OPTConfig", "START_OPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_opt": ["OPTConfig", "START_OPT_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 try:
-    if not is_torch_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_opt"] = ["OPT"]
-
+    if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_opt"] = ["OPT"]
 try:
-    if not is_flax_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_flax_opt"] = ["FlaxOPT"]
-
+    if not is_flax_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_flax_opt"] = ["FlaxOPT"]
 try:
-    if not is_tf_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_tf_opt"] = ["TFOPT"]
-
-
+    if not is_tf_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_tf_opt"] = ["TFOPT"]
 if t.TYPE_CHECKING:
    from .configuration_opt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_opt import START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING
    from .configuration_opt import OPTConfig as OPTConfig
-
    try:
-        if not is_torch_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_opt import OPT as OPT
-
+        if not is_torch_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_opt import OPT as OPT
    try:
-        if not is_flax_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_flax_opt import FlaxOPT as FlaxOPT
-
+        if not is_flax_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_flax_opt import FlaxOPT as FlaxOPT
    try:
-        if not is_tf_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_tf_opt import TFOPT as TFOPT
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_tf_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_tf_opt import TFOPT as TFOPT
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/opt/configuration_opt.py
+++ b/src/openllm/models/opt/configuration_opt.py
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
-
 import openllm
-
-
 class OPTConfig(openllm.LLMConfig):
    """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.

@@ -27,13 +23,12 @@ class OPTConfig(openllm.LLMConfig):

    Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
    """
-
    __config__ = {
        "name_type": "lowercase",
        "trust_remote_code": False,
        "url": "https://huggingface.co/docs/transformers/model_doc/opt",
        "default_id": "facebook/opt-1.3b",
-        "architecture": "MPTForCausalLM",
+        "architecture": "OPTForCausalLM",
        "model_ids": [
            "facebook/opt-125m",
            "facebook/opt-350m",
@@ -53,20 +48,12 @@ class OPTConfig(openllm.LLMConfig):
            },
        ),
    }
-
-    format_outputs: bool = openllm.LLMConfig.Field(
-        False,
-        description="""Whether to format the outputs. This
-    can be used when num_return_sequences > 1.""",
-    )
-
+    format_outputs: bool = openllm.LLMConfig.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""")
    class GenerationConfig:
        top_k: int = 15
        temperature: float = 0.75
        max_new_tokens: int = 1024
        num_return_sequences: int = 1
-
-
 START_OPT_COMMAND_DOCSTRING = """\
 Run a LLMServer for OPT model.

@@ -92,5 +79,4 @@ or provide `--model-id` flag when running ``openllm start opt``:
 \b
 $ openllm start opt --model-id facebook/opt-6.7b
 """
-
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
--- a/src/openllm/models/opt/modeling_flax_opt.py
+++ b/src/openllm/models/opt/modeling_flax_opt.py
@@ -11,109 +11,37 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
 import logging
 import typing as t
-
 import bentoml
 import openllm
-
 from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
 from ..._prompt import default_formatter
 from ...utils import generate_labels
-
-
-if t.TYPE_CHECKING:
-    import transformers
-else:
-    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
+if t.TYPE_CHECKING: import transformers
+else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
 logger = logging.getLogger(__name__)
-
-
 class FlaxOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
    __openllm_internal__ = True
-
    @property
-    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
-        tokenizer_kwds = {
-            "padding_side": "left",
-            "truncation_side": "left",
-        }
-        return {}, tokenizer_kwds
+    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {}, {"padding_side": "left", "truncation_side": "left"}

    def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
-        _, tokenizer_attrs = self.llm_parameters
-
-        config = transformers.AutoConfig.from_pretrained(self.model_id)
-        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
+        config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
        tokenizer.pad_token_id = config.pad_token_id
-        model = t.cast(
-            "transformers.FlaxOPTForCausalLM",
-            transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs),
-        )
-        return bentoml.transformers.save_model(
-            self.tag,
-            model,
-            custom_objects={"tokenizer": tokenizer},
-            labels=generate_labels(self),
-        )
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        temperature: float | None = None,
-        top_k: int | None = None,
-        num_return_sequences: int | None = None,
-        repetition_penalty: float | None = None,
-        use_default_prompt_template: bool = False,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+        return bentoml.transformers.save_model(self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_k": top_k,
-            "num_return_sequences": num_return_sequences,
-            "repetition_penalty": repetition_penalty,
-        }
-        return prompt_text, generation_config, {}
-
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty}, {}
    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
-        if len(generation_result) == 1:
-            if self.config.format_outputs:
-                logger.warning("'format_outputs' doesn't have any effect when 'num_return_sequences=1'")
-            return generation_result[0]
-
-        if self.config.format_outputs:
-            return "Generated result:\n" + "\n -".join(generation_result)
-        else:
-            return "\n".join(generation_result)
-
-    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        input_ids = self.tokenizer(prompt, return_tensors="np")
-        generated_tensors = self.model.generate(
-            **input_ids,
-            do_sample=True,
-            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-        )
-        return self.tokenizer.batch_decode(generated_tensors.sequences, skip_special_tokens=True)
+        if len(generation_result) == 1: return generation_result[0]
+        if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
+        else: return "\n".join(generation_result)
+    def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode( self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True)
--- a/src/openllm/models/opt/modeling_opt.py
+++ b/src/openllm/models/opt/modeling_opt.py
@@ -11,129 +11,38 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
 import logging
 import typing as t
-
-import bentoml
 import openllm
-
 from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
 from ..._prompt import default_formatter
-from ...utils import generate_labels
-
-
 if t.TYPE_CHECKING:
-    import torch
-
-    import transformers
+    import torch, transformers
 else:
-    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
+    torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
 logger = logging.getLogger(__name__)
-
-
 class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer"]):
    __openllm_internal__ = True
-
-    def llm_post_init(self):
-        self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
+    def llm_post_init(self): self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    @property
-    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
-        model_kwds = {
-            "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
-            "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
-        }
-        tokenizer_kwds = {
-            "padding_side": "left",
-            "truncation_side": "left",
-        }
-        return model_kwds, tokenizer_kwds
-
-    def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
-        _, tokenizer_attrs = self.llm_parameters
-
-        torch_dtype = attrs.pop("torch_dtype", self.dtype)
-
-        config = transformers.AutoConfig.from_pretrained(self.model_id)
-        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
-        tokenizer.pad_token_id = config.pad_token_id
-        model = t.cast(
-            "transformers.OPTForCausalLM",
-            transformers.AutoModelForCausalLM.from_pretrained(
-                self.model_id, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, **attrs
-            ),
-        )
-        return bentoml.transformers.save_model(
-            self.tag,
-            model,
-            custom_objects={"tokenizer": tokenizer},
-            labels=generate_labels(self),
-        )
-
+    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left", "truncation_side": "left"}
    def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
        torch_dtype = attrs.pop("torch_dtype", self.dtype)
-        model: transformers.OPTForCausalLM = transformers.AutoModelForCausalLM.from_pretrained(
-            bentoml.transformers.get(self.tag).path, *args, torch_dtype=torch_dtype, **attrs
-        )
+        model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, torch_dtype=torch_dtype, **attrs)
        return model
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        temperature: float | None = None,
-        top_k: int | None = None,
-        num_return_sequences: int | None = None,
-        use_default_prompt_template: bool = False,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_k": top_k,
-            "num_return_sequences": num_return_sequences,
-        }
-        return prompt_text, generation_config, {}
-
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
-        if len(generation_result) == 1:
-            if self.config.format_outputs:
-                logger.warning("'format_outputs' doesn't have any effect when 'num_return_sequences=1'")
-            return generation_result[0]
-
-        if self.config.format_outputs:
-            return "Generated result:\n" + "\n -".join(generation_result)
-        else:
-            return "\n".join(generation_result)
-
+        if len(generation_result) == 1: return generation_result[0]
+        if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
+        else: return "\n".join(generation_result)
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        with torch.inference_mode():
-            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-            generated_tensors = self.model.generate(
-                **inputs,
-                do_sample=True,
-                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-            )
-            return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
+        with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
--- a/src/openllm/models/opt/modeling_tf_opt.py
+++ b/src/openllm/models/opt/modeling_tf_opt.py
@@ -11,107 +11,36 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
 import logging
 import typing as t
-
 import bentoml
 import openllm
-
 from .configuration_opt import DEFAULT_PROMPT_TEMPLATE
 from ..._prompt import default_formatter
 from ...utils import generate_labels
-
-
-if t.TYPE_CHECKING:
-    import transformers
-else:
-    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
-
+if t.TYPE_CHECKING: import transformers
+else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
 logger = logging.getLogger(__name__)
-
-
 class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
    __openllm_internal__ = True
-
    @property
-    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]] | None:
-        tokenizer_kwds = {
-            "padding_side": "left",
-            "truncation_side": "left",
-        }
-        return {}, tokenizer_kwds
-
+    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {}, {"padding_side": "left", "truncation_side": "left"}
    def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
-        _, tokenizer_attrs = self.llm_parameters
-
-        config = transformers.AutoConfig.from_pretrained(self.model_id)
-        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
+        config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
        tokenizer.pad_token_id = config.pad_token_id
-        model: transformers.TFOPTForCausalLM = transformers.TFOPTForCausalLM.from_pretrained(
-            self.model_id, trust_remote_code=trust_remote_code, **attrs
-        )
-        return bentoml.transformers.save_model(
-            self.tag,
-            model,
-            custom_objects={"tokenizer": tokenizer},
-            labels=generate_labels(self),
-        )
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        max_new_tokens: int | None = None,
-        temperature: float | None = None,
-        top_k: int | None = None,
-        num_return_sequences: int | None = None,
-        use_default_prompt_template: bool = False,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+        return bentoml.transformers.save_model(self.tag, transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+    def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
        if use_default_prompt_template:
            template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
            prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
-            try:
-                prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
-                    "Use 'use_default_prompt_template=False' to disable the default prompt template."
-                ) from None
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_k": top_k,
-            "num_return_sequences": num_return_sequences,
-        }
-        return prompt_text, generation_config, {}
-
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
+            try: prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
+            except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_default_prompt_template=False' to disable the default prompt template.") from None
+        else: prompt_text = prompt
+        return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
-        if len(generation_result) == 1:
-            if self.config.format_outputs:
-                logger.warning("'format_outputs' doesn't have any effect when 'num_return_sequences=1'")
-            return generation_result[0]
-
-        if self.config.format_outputs:
-            return "Generated result:\n" + "\n -".join(generation_result)
-        else:
-            return "\n".join(generation_result)
-
-    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-        input_ids = self.tokenizer(prompt, return_tensors="tf")
-        generated_tensors = self.model.generate(
-            **input_ids,
-            do_sample=True,
-            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-        )
-        return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True)
+        if len(generation_result) == 1: return generation_result[0]
+        if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
+        else: return "\n".join(generation_result)
+    def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True)
--- a/src/openllm/models/stablelm/__init__.py
+++ b/src/openllm/models/stablelm/__init__.py
@@ -11,40 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_stablelm": ["StableLMConfig", "START_STABLELM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_stablelm": ["StableLMConfig", "START_STABLELM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 try:
-    if not is_torch_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_stablelm"] = ["StableLM"]
-
+    if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_stablelm"] = ["StableLM"]
 if t.TYPE_CHECKING:
    from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_stablelm import START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING
    from .configuration_stablelm import StableLMConfig as StableLMConfig
-
    try:
-        if not is_torch_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_stablelm import StableLM as StableLM
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_torch_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_stablelm import StableLM as StableLM
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/stablelm/configuration_stablelm.py
+++ b/src/openllm/models/stablelm/configuration_stablelm.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import annotations
-
 import openllm
-
-
 class StableLMConfig(openllm.LLMConfig):
    """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.

@@ -30,7 +27,6 @@ class StableLMConfig(openllm.LLMConfig):
    and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)
    for more information.
    """
-
    __config__ = {
        "name_type": "lowercase",
        "url": "https://github.com/Stability-AI/StableLM",
@@ -43,14 +39,11 @@ class StableLMConfig(openllm.LLMConfig):
            "stabilityai/stablelm-base-alpha-7b",
        ],
    }
-
    class GenerationConfig:
        temperature: float = 0.9
        max_new_tokens: int = 128
        top_k: int = 0
        top_p: float = 0.9
-
-
 START_STABLELM_COMMAND_DOCSTRING = """\
 Run a LLMServer for StableLM model.

@@ -70,12 +63,10 @@ or provide `--model-id` flag when running ``openllm start stablelm``:
 \b
 $ openllm start stablelm --model-id 'stabilityai/stablelm-tuned-alpha-3b'
 """
-
 SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
 - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
 - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
 - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
 - StableLM will refuse to participate in anything that could harm a human.
 """
-
 DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>"""
--- a/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/src/openllm/models/stablelm/modeling_stablelm.py
@@ -14,91 +14,27 @@
 from __future__ import annotations
 import logging
 import typing as t
-
 import openllm
-
 from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE
 from .configuration_stablelm import SYSTEM_PROMPT
 from ..._prompt import default_formatter
-
-
-if t.TYPE_CHECKING:
-    import transformers  # noqa
-    import torch
-else:
-    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-
-
+if t.TYPE_CHECKING: import transformers, torch
+else: transformers, torch = openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("torch", globals(), "torch")
 logger = logging.getLogger(__name__)
-
-
 class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
    __openllm_internal__ = True
-
-    def llm_post_init(self):
-        self.bettertransformer = True if not torch.cuda.is_available() else False
-
+    # Set the ``bettertransformer`` flag to True only when CUDA is not available.
+    def llm_post_init(self): self.bettertransformer = not torch.cuda.is_available()
    @property
-    def import_kwargs(self):
-        model_kwds = {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}
-        tokenizer_kwds: dict[str, t.Any] = {}
-        return model_kwds, tokenizer_kwds
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        temperature: float | None = None,
-        max_new_tokens: int | None = None,
-        top_k: int | None = None,
-        top_p: float | None = None,
-        use_default_prompt_template: bool = False,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    # Returns the (model kwargs, tokenizer kwargs) pair: the model is loaded as float16 when CUDA is
+    # available and float32 otherwise; no tokenizer overrides are supplied.
+    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+    def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+        # Build the final prompt text and the generation kwargs.
+        # Returns a 3-tuple: (prompt text, generation kwargs, empty dict).
+        # The default template is applied only when the model id contains "tuned" AND the caller
+        # opts in with ``use_default_prompt_template=True``; otherwise the prompt is passed through.
+        # Raises RuntimeError if 'instruction' is supplied through ``attrs`` while templating.
        if "tuned" in self._model_id and use_default_prompt_template:
-            prompt_variables = {
-                k: v
-                for k, v in attrs.items()
-                if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
-            }
-            if "instruction" in prompt_variables:
-                raise RuntimeError(
-                    "'instruction' should be passed as the first argument "
-                    "instead of kwargs when 'use_default_prompt_template=True'"
-                )
+            # Keep only the kwargs that name a variable of the default template.
+            prompt_variables = {k: v for k, v in attrs.items() if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)}
+            if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_default_prompt_template=True'")
            system_prompt = prompt_variables.pop("system_prompt", SYSTEM_PROMPT)
            prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, system_prompt=system_prompt)
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "max_new_tokens": max_new_tokens,
-            "temperature": temperature,
-            "top_k": top_k,
-            "top_p": top_p,
-        }
-
-        return prompt_text, generation_config, {}
-
-    def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
-        return generation_result[0]
-
+        else: prompt_text = prompt
+        return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {}
+    # Returns the first generated sequence; ``prompt`` and any extra keyword arguments are ignored.
+    def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
        from ..._generation import StopOnTokens
-
-        generation_kwargs = {
-            "do_sample": True,
-            "generation_config": self.config.model_construct_env(**attrs).to_generation_config(),
-            "pad_token_id": self.tokenizer.eos_token_id,
-            "stopping_criteria": transformers.StoppingCriteriaList([StopOnTokens()]),
-        }
-
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-
-        with torch.inference_mode():
-            if torch.cuda.is_available():
-                with torch.autocast("cuda", torch.float16):
-                    tokens = self.model.generate(**inputs, **generation_kwargs)
-            else:
-                tokens = self.model.generate(**inputs, **generation_kwargs)
-        return [self.tokenizer.decode(tokens[0], skip_special_tokens=True)]
+        # Tokenizes ``prompt`` with return_tensors="pt", moves the inputs to ``self.device`` and samples
+        # under ``torch.inference_mode()`` with the tokenizer's EOS id as pad token and ``StopOnTokens``
+        # as the stopping criterion. Only the first returned sequence is decoded (special tokens
+        # stripped) and it is wrapped in a one-element list.
+        # NOTE(review): the previous revision wrapped generation in torch.autocast("cuda", torch.float16)
+        # when CUDA was available; this version does not -- confirm that is intended.
+        with torch.inference_mode(): return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=transformers.StoppingCriteriaList([StopOnTokens()]))[0], skip_special_tokens=True)]
--- a/src/openllm/models/starcoder/init.py
+++ b/src/openllm/models/starcoder/init.py
@@ -11,40 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations
+import sys
 import typing as t
-
 from ...exceptions import MissingDependencyError
 from ...utils import LazyModule
 from ...utils import is_torch_available
-
-
-_import_structure: dict[str, list[str]] = {
-    "configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"],
-}
-
+_import_structure: dict[str, list[str]] = {"configuration_starcoder": ["StarCoderConfig", "START_STARCODER_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
 try:
-    if not is_torch_available():
-        raise MissingDependencyError
-except MissingDependencyError:
-    pass
-else:
-    _import_structure["modeling_starcoder"] = ["StarCoder"]
-
+    if not is_torch_available(): raise MissingDependencyError
+except MissingDependencyError: pass
+else: _import_structure["modeling_starcoder"] = ["StarCoder"]
 if t.TYPE_CHECKING:
    from .configuration_starcoder import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE
    from .configuration_starcoder import START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING
    from .configuration_starcoder import StarCoderConfig as StarCoderConfig
-
    try:
-        if not is_torch_available():
-            raise MissingDependencyError
-    except MissingDependencyError:
-        pass
-    else:
-        from .modeling_starcoder import StarCoder as StarCoder
-else:
-    import sys
-
-    sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+        if not is_torch_available(): raise MissingDependencyError
+    except MissingDependencyError: pass
+    else: from .modeling_starcoder import StarCoder as StarCoder
+else: sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--- a/src/openllm/models/starcoder/configuration_starcoder.py
+++ b/src/openllm/models/starcoder/configuration_starcoder.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import annotations
-
 import openllm
-
-
 class StarCoderConfig(openllm.LLMConfig):
    """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.

@@ -25,7 +22,6 @@ class StarCoderConfig(openllm.LLMConfig):

    Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information.
    """
-
    __config__ = {
        "name_type": "lowercase",
        "requires_gpu": True,
@@ -36,7 +32,6 @@ class StarCoderConfig(openllm.LLMConfig):
        "default_id": "bigcode/starcoder",
        "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"],
    }
-
    class GenerationConfig:
        temperature: float = 0.2
        max_new_tokens: int = 256
@@ -45,8 +40,6 @@ class StarCoderConfig(openllm.LLMConfig):
        top_p: float = 0.95
        pad_token_id: int = 49152
        repetition_penalty: float = 1.2
-
-
 START_STARCODER_COMMAND_DOCSTRING = """\
 Run a LLMServer for StarCoder model.

@@ -66,5 +59,4 @@ or provide `--model-id` flag when running ``openllm start starcoder``:
 \b
 $ openllm start starcoder --model-id 'bigcode/starcoder'
 """
-
 DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
--- a/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/src/openllm/models/starcoder/modeling_starcoder.py
@@ -14,143 +14,53 @@
 from __future__ import annotations
 import logging
 import typing as t
-
 import bentoml
 import openllm
-
 from ...utils import generate_labels
-
-
 if t.TYPE_CHECKING:
-    import torch
-
-    import transformers
+    import torch, transformers
 else:
-    torch = openllm.utils.LazyLoader("torch", globals(), "torch")
-    transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
-
+    torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
 logger = logging.getLogger(__name__)
-
-FIM_PREFIX = "<fim-prefix>"
-FIM_MIDDLE = "<fim-middle>"
-FIM_SUFFIX = "<fim-suffix>"
-FIM_PAD = "<fim-pad>"
-EOD = "<|endoftext|>"
-FIM_INDICATOR = "<FILL_HERE>"
-
-
+FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = "<fim-prefix>", "<fim-middle>", "<fim-suffix>", "<fim-pad>", "<|endoftext|>", "<FILL_HERE>"
 class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
    __openllm_internal__ = True
-
    @property
-    def import_kwargs(self):
-        model_kwds = {
-            "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
-            "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
-        }
-        tokenizer_kwds = {"padding_side": "left"}
-        return model_kwds, tokenizer_kwds
-
+    # Returns the (model kwargs, tokenizer kwargs) pair: ``device_map`` is "auto" only when CUDA is
+    # available with more than one device (None otherwise), weights are float16 on CUDA and float32
+    # otherwise, and the tokenizer pads on the left.
+    def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {"padding_side": "left"}
    def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
-        _, tokenizer_attrs = self.llm_parameters
-
-        torch_dtype = attrs.pop("torch_dtype", torch.float16)
-        device_map = attrs.pop("device_map", "auto")
-
-        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
-        tokenizer.add_special_tokens(
-            {
-                "additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
-                "pad_token": EOD,
-            }
-        )
-
-        model = transformers.AutoModelForCausalLM.from_pretrained(
-            self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs
-        )
-        try:
-            return bentoml.transformers.save_model(
-                self.tag,
-                model,
-                custom_objects={"tokenizer": tokenizer},
-                labels=generate_labels(self),
-            )
-        finally:
-            # NOTE: We need to free the cache after saving here so that we can load it back later on.
-            torch.cuda.empty_cache()
-
-    def sanitize_parameters(
-        self,
-        prompt: str,
-        temperature: float | None = None,
-        top_p: float | None = None,
-        max_new_tokens: int | None = None,
-        repetition_penalty: float | None = None,
-        **attrs: t.Any,
-    ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
-        fim_mode = FIM_INDICATOR in prompt
-        prefix, suffix = None, None
+        # Loads the tokenizer and causal-LM weights for ``self.model_id``, registers the FIM/EOD special
+        # tokens (EOD doubles as the pad token) and saves the model under ``self.tag`` with the tokenizer
+        # attached as a custom object. ``torch_dtype`` / ``device_map`` are taken from ``attrs`` and
+        # default to float16 / "auto"; the tokenizer kwargs are the last element of ``self.llm_parameters``.
+        # NOTE(review): ``args`` and ``trust_remote_code`` are accepted but not forwarded to either
+        # ``from_pretrained`` call -- confirm that is intended.
+        torch_dtype, device_map = attrs.pop("torch_dtype", torch.float16), attrs.pop("device_map", "auto")
+        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+        tokenizer.add_special_tokens({"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], "pad_token": EOD})
+        model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
+        try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+        # NOTE: free the CUDA cache after saving (even on failure) so the model can be loaded back later.
+        finally: torch.cuda.empty_cache()
+    def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+        # Build the final prompt text and the generation kwargs.
+        # Returns a 3-tuple: (prompt text, generation kwargs, empty dict). Extra ``attrs`` are merged
+        # into the generation kwargs last, so they override the named parameters.
+        # When the prompt contains FIM_INDICATOR it is split around it and rewritten as
+        # FIM_PREFIX + prefix + FIM_SUFFIX + suffix + FIM_MIDDLE.
+        # Raises ValueError if the prompt contains more than one FIM_INDICATOR.
+        fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None
        if fim_mode:
-            try:
-                prefix, suffix = prompt.split(FIM_INDICATOR)
-            except Exception as err:
-                logger.error("Error while processing prompt with FIM mode:\n", exc_info=err)
-                raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
+            # Unpacking fails with ValueError only when the split yields more than two parts.
+            try: prefix, suffix = prompt.split(FIM_INDICATOR)
+            except ValueError as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
            prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
-        else:
-            prompt_text = prompt
-
-        generation_config = {
-            "temperature": temperature,
-            "top_p": top_p,
-            "max_new_tokens": max_new_tokens,
-            "repetition_penalty": repetition_penalty,
-            # XXX: This value is currently a hack, need more investigate why the
-            # default starcoder doesn't include the same value as santacoder EOD
-            "pad_token_id": 49152,
-            **attrs,
-        }
-
-        return prompt_text, generation_config, {}
-
-    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
-        return generation_result[0]
+        else: prompt_text = prompt
+        # XXX: The hard-coded pad_token_id is currently a hack; we still need to investigate why the
+        # default starcoder doesn't include the same value as santacoder EOD.
+        return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {}

+    # Returns the first element of ``generation_result``; ``prompt`` and extra keyword arguments are ignored.
+    def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
    def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
        with torch.inference_mode():
-            inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
-            result_tensor = self.model.generate(
-                inputs,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id,
-                # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
-                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-            )
+            # Encode the prompt with return_tensors="pt", move it to self.device and sample with the
+            # tokenizer's EOS id as the pad token.
+            # NOTE: for a fine-tuned StarCoder, eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>")
+            # would also be passed to generate(); it is not passed here.
+            result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors="pt").to(self.device), do_sample=True, pad_token_id=self.tokenizer.eos_token_id, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
            # TODO: We will probably want to return the tokenizer here so that we can manually process this
            # return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
-            return self.tokenizer.batch_decode(
-                result_tensor[0],
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=True,
-            )
-
-    def generate_one(
-        self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any
-    ) -> list[dict[t.Literal["generated_text"], str]]:
+            # NOTE(review): result_tensor[0] is a single sequence, so batch_decode presumably decodes it
+            # element by element, and postprocess_generate keeps only element 0 of the returned list --
+            # confirm this is intended rather than batch_decode(result_tensor, ...).
+            return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
+        # Generate a single completion that halts on any of the ``stop`` sequences.
+        # ``max_new_tokens`` (default 200) and ``stopping_criteria`` (default: empty list) are popped
+        # from the keyword arguments; a StopSequenceCriteria for ``stop`` is appended to the criteria.
+        # Only the tokens after the prompt (index >= src_len) are decoded, and a trailing stop sequence
+        # is trimmed from the text. Returns a one-element list: [{"generated_text": <text>}].
        from ..._generation import StopSequenceCriteria
-
-        max_new_tokens = preprocess_generate_kwds.pop("max_new_tokens", 200)
-        encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-        src_len = encoded_inputs["input_ids"].shape[1]
-        stopping_criteria = preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
+        max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
+        src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", transformers.StoppingCriteriaList([]))
        stopping_criteria.append(StopSequenceCriteria(stop, self.tokenizer))
-        outputs = self.model.generate(
-            encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria
-        )
-
-        result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
+        result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
        # Inference API returns the stop sequence
        for stop_seq in stop:
-            if result.endswith(stop_seq):
-                result = result[: -len(stop_seq)]
+            # Skip empty stop strings: every string ends with "", and result[: -0] would erase the text.
+            if stop_seq and result.endswith(stop_seq): result = result[: -len(stop_seq)]
        return [{"generated_text": result}]
--- a/src/openllm/serialisation/transformers.py
+++ b/src/openllm/serialisation/transformers.py
@@ -193,6 +193,10 @@ def import_model(
    if _tokenizer.pad_token is None:
        _tokenizer.pad_token = _tokenizer.eos_token

+    # NOTE: quick hack to attach the loaded model and tokenizer to the llm object
+    object.__setattr__(llm, "__llm_model__", model)
+    object.__setattr__(llm, "__llm_tokenizer__", _tokenizer)
+
    try:
        with bentoml.models.create(
            llm.tag,
@@ -210,9 +214,7 @@ def import_model(
            else None,
            metadata=metadata,
        ) as bentomodel:
-            save_pretrained(
-                llm, bentomodel.path, model=model, tokenizer=_tokenizer, safe_serialization=safe_serialisation
-            )
+            save_pretrained(llm, bentomodel.path, safe_serialization=safe_serialisation)
            return bentomodel
    finally:
        # NOTE: We need to free up the cache after importing the model
@@ -296,12 +298,12 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
        or getattr(model, "is_loaded_in_4bit", False)
        or getattr(model, "is_quantized", False)
    )
-    if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
+    if torch.cuda.is_available() and not loaded_in_kbit:
        try:
            model = model.to("cuda")
        except torch.cuda.OutOfMemoryError as err:
            raise RuntimeError(
-                f"Failed to fit {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
+                f"Failed to convert {llm.config['model_name']} with model_id '{llm.model_id}' to CUDA.\nNote: You can try out '--quantize int8 | int4' for dynamic quantization."
            ) from err
    if llm.bettertransformer and llm.__llm_implementation__ == "pt" and not isinstance(model, _transformers.Pipeline):
        # BetterTransformer is currently only supported on PyTorch.
@@ -314,27 +316,19 @@ def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
 def save_pretrained(
    llm: openllm.LLM[M, T],
    save_directory: str,
-    model: M | None = None,
-    tokenizer: T | None = None,
    is_main_process: bool = True,
    state_dict: DictStrAny | None = None,
    save_function: t.Callable[..., None] | None = None,
    push_to_hub: bool = False,
-    max_shard_size: int | str = "10GB",
+    max_shard_size: int | str = "2GB",
    safe_serialization: bool = False,
    variant: str | None = None,
    **attrs: t.Any,
 ) -> None:
    """Light wrapper around ``transformers.PreTrainedTokenizer.save_pretrained`` and ``transformers.PreTrainedModel.save_pretrained``."""
-    model = first_not_none(model, default=llm.__llm_model__)
-    tokenizer = first_not_none(tokenizer, default=llm.__llm_tokenizer__)
    save_function = first_not_none(save_function, default=torch.save)
    model_save_attrs, tokenizer_save_attrs = normalize_attrs_to_model_tokenizer_pair(**attrs)
    safe_serialization = safe_serialization or llm._serialisation_format == "safetensors"
-
-    if model is None or tokenizer is None:
-        raise RuntimeError("Failed to find loaded model or tokenizer to save to local store.")
-
    if llm._quantize_method == "gptq":
        if not is_autogptq_available():
            raise OpenLLMException(
@@ -342,11 +336,11 @@ def save_pretrained(
            )
        if llm.config["model_type"] != "causal_lm":
            raise OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-        model.save_quantized(save_directory, use_safetensors=safe_serialization)
-    elif isinstance(model, _transformers.Pipeline):
-        model.save_pretrained(save_directory, safe_serialization=safe_serialization)
+        llm.model.save_quantized(save_directory, use_safetensors=safe_serialization)
+    elif isinstance(llm.model, _transformers.Pipeline):
+        llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
    else:
-        model.save_pretrained(
+        llm.model.save_pretrained(
            save_directory,
            is_main_process=is_main_process,
            state_dict=state_dict,
@@ -357,4 +351,4 @@ def save_pretrained(
            variant=variant,
            **model_save_attrs,
        )
-    tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
+    llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
--- a/src/openllm_client/runtimes/base.py
+++ b/src/openllm_client/runtimes/base.py
@@ -90,6 +90,10 @@ class ClientMeta(t.Generic[T]):

    @property
    def _hf_agent(self) -> transformers.HfAgent:
+        if not self.supports_hf_agent:
+            raise openllm.exceptions.OpenLLMException(
+                f"{self.model_name} ({self.framework}) does not support running HF agent."
+            )
        if self.__agent__ is None:
            if not openllm.utils.is_transformers_supports_agent():
                raise RuntimeError(
@@ -130,6 +134,16 @@ class ClientMeta(t.Generic[T]):
    def configuration(self) -> dict[str, t.Any]:
        raise NotImplementedError

+    @property
+    @abstractmethod
+    def supports_embeddings(self) -> bool:
+        """Whether the connected service supports embeddings; must be provided by concrete clients."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def supports_hf_agent(self) -> bool:
+        """Whether the connected service supports the HF agent; must be provided by concrete clients.
+
+        ``_hf_agent`` checks this flag and raises ``OpenLLMException`` when it is false.
+        """
+        raise NotImplementedError
+
    @property
    def llm(self) -> openllm.LLM[t.Any, t.Any]:
        if self.__llm__ is None:
--- a/src/openllm_client/runtimes/grpc.py
+++ b/src/openllm_client/runtimes/grpc.py
@@ -80,6 +80,20 @@ class GrpcClientMixin:
        except KeyError:
            raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

+    @property
+    def supports_embeddings(self) -> bool:
+        """Return the ``supports_embeddings`` boolean read from the service metadata struct.
+
+        Raises:
+            RuntimeError: if the metadata lookup raises ``KeyError``.
+        """
+        # NOTE(review): protobuf message-valued map fields may hand back a default entry for a
+        # missing key instead of raising KeyError -- confirm this handler is reachable.
+        try:
+            return self._metadata.json.struct_value.fields["supports_embeddings"].bool_value
+        except KeyError:
+            raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
+    @property
+    def supports_hf_agent(self) -> bool:
+        """Return the ``supports_hf_agent`` boolean read from the service metadata struct.
+
+        Raises:
+            RuntimeError: if the metadata lookup raises ``KeyError``.
+        """
+        # NOTE(review): protobuf message-valued map fields may hand back a default entry for a
+        # missing key instead of raising KeyError -- confirm this handler is reachable.
+        try:
+            return self._metadata.json.struct_value.fields["supports_hf_agent"].bool_value
+        except KeyError:
+            raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
    def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
        if isinstance(result, dict):
            return openllm.GenerationOutput(**result)
--- a/src/openllm_client/runtimes/http.py
+++ b/src/openllm_client/runtimes/http.py
@@ -77,6 +77,20 @@ class HTTPClientMixin:
        except KeyError:
            raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

+    @property
+    def supports_embeddings(self) -> bool:
+        """Return the ``supports_embeddings`` entry of the service metadata, defaulting to ``False``."""
+        # NOTE(review): assuming ``_metadata`` is a mapping, ``.get(..., False)`` never raises KeyError
+        # for a missing key, so the handler below can only fire if evaluating ``self._metadata`` itself
+        # raises -- confirm whether a missing key should be an error or silently mean "unsupported".
+        try:
+            return self._metadata.get("supports_embeddings", False)
+        except KeyError:
+            raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
+    @property
+    def supports_hf_agent(self) -> bool:
+        """Return the ``supports_hf_agent`` entry of the service metadata, defaulting to ``False``."""
+        # NOTE(review): assuming ``_metadata`` is a mapping, ``.get(..., False)`` never raises KeyError
+        # for a missing key, so the handler below can only fire if evaluating ``self._metadata`` itself
+        # raises -- confirm whether a missing key should be an error or silently mean "unsupported".
+        try:
+            return self._metadata.get("supports_hf_agent", False)
+        except KeyError:
+            raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None
+
    def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput:
        return openllm.GenerationOutput(**result)

--- a/typings/attr/init.pyi
+++ b/typings/attr/init.pyi
@@ -12,6 +12,7 @@ from typing import Protocol
 from typing import Sequence
 from typing import Tuple
 from typing import Type
+from typing import TypeAlias
 from typing import TypeGuard
 from typing import TypeVar
 from typing import Union
@@ -40,16 +41,16 @@ __copyright__: str
 _T = TypeVar("_T")
 _C = TypeVar("_C", bound=type)
 _P = ParamSpec("_P")
-_EqOrderType = Union[bool, Callable[[Any], Any]]
-_ValidatorType = Callable[[Any, Attribute[_T], _T], Any]
-_ConverterType = Callable[[Any], Any]
-_FilterType = Callable[[Attribute[_T], _T], bool]
-_ReprType = Callable[[Any], str]
-_ReprArgType = Union[bool, _ReprType]
-_OnSetAttrType = Callable[[Any, Attribute[Any], Any], Any]
-_OnSetAttrArgType = Union[_OnSetAttrType, List[_OnSetAttrType], setters._NoOpType]
-_FieldTransformer = Callable[[type, List[Attribute[Any]]], List[Attribute[Any]]]
-_ValidatorArgType = Union[_ValidatorType[_T], Sequence[_ValidatorType[_T]]]
+_EqOrderType: TypeAlias = Union[bool, Callable[[Any], Any]]
+_ValidatorType: TypeAlias = Callable[[Any, Attribute[_T], _T], Any]
+_ConverterType: TypeAlias = Callable[[Any], Any]
+_FilterType: TypeAlias = Callable[[Attribute[_T], _T], bool]
+_ReprType: TypeAlias = Callable[[Any], str]
+_ReprArgType: TypeAlias = Union[bool, _ReprType]
+_OnSetAttrType: TypeAlias = Callable[[Any, Attribute[Any], Any], Any]
+_OnSetAttrArgType: TypeAlias = Union[_OnSetAttrType, List[_OnSetAttrType], setters._NoOpType]
+_FieldTransformer: TypeAlias = Callable[[type, List[Attribute[Any]]], List[Attribute[Any]]]
+_ValidatorArgType: TypeAlias = Union[_ValidatorType[_T], Sequence[_ValidatorType[_T]]]

 class AttrsInstance(AttrsInstance_, Protocol): ...

@@ -535,8 +536,10 @@ def get_run_validators() -> bool: ...

 # aliases --

-s = attributes = attrs
-ib = attr = attrib
+s = attrs
+attributes = attrs
+ib = attrib
+attr = attrib
 dataclass = attrs  # Technically, partial(attrs, auto_attribs=True) ;)

 class ReprProtocol(Protocol):
--- a/typings/attr/_cmp.pyi
+++ b/typings/attr/_cmp.pyi
@@ -1,8 +1,9 @@
 from typing import Any
 from typing import Callable
 from typing import Optional
+from typing import TypeAlias

-_CompareWithType = Callable[[Any, Any], bool]
+_CompareWithType: TypeAlias = Callable[[Any, Any], bool]

 def cmp_using(
    eq: Optional[_CompareWithType] = ...,
--- a/typings/attr/_compat.pyi
+++ b/typings/attr/_compat.pyi
@@ -1,5 +1,5 @@
-from typing import Any
 import threading
+from typing import Any

 def set_closure_cell(cell: Any, value: Any) -> None: ...

--- a/typings/attr/_make.pyi
+++ b/typings/attr/_make.pyi
@@ -1,4 +1,4 @@
 from . import _CountingAttr as _CountingAttr
-from . import _make_repr as _make_repr
 from . import _make_init as _make_init
+from . import _make_repr as _make_repr
 from . import _transform_attrs as _transform_attrs
--- a/typings/click_option_group/_core.pyi
+++ b/typings/click_option_group/_core.pyi
@@ -15,7 +15,7 @@ import click

 _R = TypeVar("_R")
 _T = TypeVar("_T")
-AnyCallable = Callable[..., Any]
+AnyCallable: TypeAlias = Callable[..., Any]
 Decorator: TypeAlias = Callable[[_T], _T]
 _FC = TypeVar("_FC", bound=Union[AnyCallable, click.Command])

--- a/typings/deepmerge/merger.pyi
+++ b/typings/deepmerge/merger.pyi
@@ -2,6 +2,7 @@ from typing import Any
 from typing import Dict
 from typing import List
 from typing import Tuple
+from typing import TypeAlias
 from typing import Union

 from .strategy.core import StrategyList
@@ -9,7 +10,7 @@ from .strategy.dict import DictStrategies
 from .strategy.list import ListStrategies
 from .strategy.set import SetStrategies

-ConfigDictType = Dict[str, Any]
+ConfigDictType: TypeAlias = Dict[str, Any]

 class Merger:
    PROVIDED_TYPE_STRATEGIES: Dict[type, Union[ListStrategies, DictStrategies, SetStrategies]] = ...
--- a/typings/deepmerge/strategy/core.pyi
+++ b/typings/deepmerge/strategy/core.pyi
@@ -2,9 +2,10 @@ from typing import Any
 from typing import Callable
 from typing import List
 from typing import Optional
+from typing import TypeAlias
 from typing import Union

-_StringOrFunction = Union[str, Callable[..., Any]]
+_StringOrFunction: TypeAlias = Union[str, Callable[..., Any]]
 STRATEGY_END: object = ...

 class StrategyList:
--- a/typings/jupytext/config.pyi
+++ b/typings/jupytext/config.pyi
@@ -1,6 +1,6 @@
+from collections.abc import Generator
 from typing import Any
 from typing import Dict
-from collections.abc import Generator

 from _typeshed import Incomplete

--- a/typings/jupytext/formats.pyi
+++ b/typings/jupytext/formats.pyi
@@ -1,4 +1,5 @@
 from typing import Any
+
 from _typeshed import Incomplete

 class JupytextFormatError(ValueError): ...