From c33b071ee4785d1f7c1f80a6796a32b97552a82f Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Tue, 21 Nov 2023 04:39:48 -0500
Subject: [PATCH] refactor: delete unused code (#716)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---
 mypy.ini                                      |   2 +-
 openllm-client/src/openllm_client/_utils.pyi  |   1 -
 .../src/openllm_core/utils/__init__.py        |  54 +---
 openllm-python/src/openllm/__init__.py        |  11 -
 openllm-python/src/openllm/__init__.pyi       |  10 -
 openllm-python/src/openllm/__main__.py        |  15 +-
 openllm-python/src/openllm/_deprecated.py     |  13 +-
 openllm-python/src/openllm/_generation.py     |  17 --
 openllm-python/src/openllm/_generation.pyi    |  22 +-
 openllm-python/src/openllm/_llm.py            |   6 +-
 openllm-python/src/openllm/_llm.pyi           |   2 -
 openllm-python/src/openllm/_runners.py        |  23 +-
 openllm-python/src/openllm/_strategies.py     | 242 +++++++---------
 openllm-python/src/openllm/client.py          |   9 +-
 openllm-python/src/openllm/testing.py         |  75 -----
 .../openllm/{utils/__init__.py => utils.py}   |   0
 .../openllm/{utils/__init__.pyi => utils.pyi} |   1 -
 openllm-python/src/openllm_cli/_factory.py    |   6 +-
 openllm-python/src/openllm_cli/_sdk.py        |   2 +-
 openllm-python/src/openllm_cli/entrypoint.py  | 115 +-------
 openllm-python/tests/models/__init__.py       |   0
 .../flan_t5_test/test_flan_t5[container].json |  33 ---
 .../flan_t5_test/test_flan_t5[local].json     |  33 ---
 .../opt_test/test_opt_125m[container].json    |  34 ---
 .../opt_test/test_opt_125m[local].json        |  34 ---
 openllm-python/tests/models/conftest.py       | 266 ------------------
 openllm-python/tests/models/flan_t5_test.py   |  40 ---
 openllm-python/tests/models/opt_test.py       |  40 ---
 28 files changed, 149 insertions(+), 957 deletions(-)
 delete mode 100644 openllm-python/src/openllm/testing.py
 rename openllm-python/src/openllm/{utils/__init__.py => utils.py} (100%)
 rename openllm-python/src/openllm/{utils/__init__.pyi => utils.pyi} (98%)
 delete mode 100644 openllm-python/tests/models/__init__.py
 delete mode 100644 openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json
 delete mode 100644 openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json
 delete mode 100644 openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json
 delete mode 100644 openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json
 delete mode 100644 openllm-python/tests/models/conftest.py
 delete mode 100644 openllm-python/tests/models/flan_t5_test.py
 delete mode 100644 openllm-python/tests/models/opt_test.py

diff --git a/mypy.ini b/mypy.ini
index c63d8cc2..d8b92f8e 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -8,4 +8,4 @@ warn_unused_configs = true
 ignore_missing_imports = true
 check_untyped_defs = true
 warn_unreachable = true
-files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi, openllm-python/src/openllm/_service_vars.pyi, openllm-python/src/openllm/utils/__init__.pyi, openllm-python/src/openllm/serialisation/_helpers.pyi
+files = openllm-python/src/openllm/bundle/__init__.pyi, openllm-python/src/openllm/serialisation/__init__.pyi, openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-python/src/openllm/__init__.pyi, openllm-client/src/openllm_client/_typing_compat.py, openllm-core/src/openllm_core/_typing_compat.py, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/bundle/_package.pyi, openllm-python/src/openllm/_runners.pyi, openllm-python/src/openllm/_quantisation.pyi, openllm-python/src/openllm/_llm.pyi, openllm-python/src/openllm/_generation.pyi, openllm-python/src/openllm/entrypoints/openai.pyi, openllm-python/src/openllm/entrypoints/__init__.pyi, openllm-python/src/openllm/entrypoints/hf.pyi, openllm-python/src/openllm/entrypoints/_openapi.pyi, openllm-python/src/openllm/entrypoints/cohere.pyi, openllm-python/src/openllm/_service_vars.pyi, openllm-python/src/openllm/serialisation/_helpers.pyi, openllm-python/src/openllm/utils.pyi
diff --git a/openllm-client/src/openllm_client/_utils.pyi b/openllm-client/src/openllm_client/_utils.pyi
index 6aa84fc2..3e2e5f25 100644
--- a/openllm-client/src/openllm_client/_utils.pyi
+++ b/openllm-client/src/openllm_client/_utils.pyi
@@ -22,7 +22,6 @@ from openllm_core.utils import (
   getenv as getenv,
   in_notebook as in_notebook,
   lenient_issubclass as lenient_issubclass,
-  reserve_free_port as reserve_free_port,
   resolve_filepath as resolve_filepath,
   resolve_user_filepath as resolve_user_filepath,
   serde as serde,
diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py
index faa92acb..d0963bf1 100644
--- a/openllm-core/src/openllm_core/utils/__init__.py
+++ b/openllm-core/src/openllm_core/utils/__init__.py
@@ -1,12 +1,9 @@
 from __future__ import annotations
-import contextlib
 import functools
 import hashlib
 import logging
 import logging.config
 import os
-import random
-import socket
 import sys
 import types
 import typing as t
@@ -19,7 +16,6 @@ from .lazy import LazyLoader as LazyLoader, LazyModule as LazyModule, VersionInf
 from .._typing_compat import overload as _overload
 
 if t.TYPE_CHECKING:
-  from bentoml._internal.models.model import ModelContext
   from bentoml._internal.types import PathType
   from openllm_core._typing_compat import AnyCallable
 
@@ -36,52 +32,6 @@ _object_setattr = object.__setattr__
 logger = logging.getLogger(__name__)
 
 
-@contextlib.contextmanager
-def reserve_free_port(
-  host: str = 'localhost',
-  port: int | None = None,
-  prefix: str | None = None,
-  max_retry: int = 50,
-  enable_so_reuseport: bool = False,
-) -> t.Iterator[int]:
-  """
-  detect free port and reserve until exit the context
-  """
-  import psutil
-
-  sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-  if enable_so_reuseport:
-    if psutil.WINDOWS:
-      sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-    elif psutil.MACOS or psutil.FREEBSD:
-      sock.setsockopt(socket.SOL_SOCKET, 0x10000, 1)  # SO_REUSEPORT_LB
-    else:
-      sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
-      if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0:
-        raise RuntimeError('Failed to set SO_REUSEPORT.') from None
-  if prefix is not None:
-    prefix_num = int(prefix) * 10 ** (5 - len(prefix))
-    suffix_range = min(65535 - prefix_num, 10 ** (5 - len(prefix)))
-    for _ in range(max_retry):
-      suffix = random.randint(0, suffix_range)
-      port = int(f'{prefix_num + suffix}')
-      try:
-        sock.bind((host, port))
-        break
-      except OSError:
-        continue
-    else:
-      raise RuntimeError(f'Cannot find free port with prefix {prefix} after {max_retry} retries.') from None
-  elif port:
-    sock.bind((host, port))
-  else:
-    sock.bind((host, 0))
-  try:
-    yield sock.getsockname()[1]
-  finally:
-    sock.close()
-
-
 # fmt: off
 _T=t.TypeVar('_T')
 @functools.lru_cache(maxsize=1)
@@ -133,7 +83,7 @@ def set_disable_warnings(disable:bool=True)->None:
   if get_disable_warnings():os.environ[WARNING_ENV_VAR]=str(disable)
 def set_debug_mode(enabled:bool,level:int=1)->None:
   if enabled:os.environ[DEV_DEBUG_VAR] = str(level)
-  os.environ.update({DEBUG_ENV_VAR:str(enabled),_GRPC_DEBUG_ENV_VAR:'DEBUG' if enabled else 'ERROR','CT2_VERBOSE':'3'})
+  os.environ.update({DEBUG_ENV_VAR:str(enabled),QUIET_ENV_VAR:str(not enabled),_GRPC_DEBUG_ENV_VAR:'DEBUG' if enabled else 'ERROR','CT2_VERBOSE':'3'})
   set_disable_warnings(enabled)
 def set_quiet_mode(enabled:bool)->None:
   os.environ.update({QUIET_ENV_VAR:str(enabled),_GRPC_DEBUG_ENV_VAR:'NONE','CT2_VERBOSE':'-1'})
@@ -151,7 +101,7 @@ def first_not_none(*args:_T|None,default:_T)->_T:...
 @_overload
 def first_not_none(*args:_T|None)->_T|None:...
 def first_not_none(*args:_T|None,default:_T|None=None)->_T|None:return next((arg for arg in args if arg is not None),default)
-def generate_context(framework_name:str)->ModelContext:
+def generate_context(framework_name:str):
   from bentoml._internal.models.model import ModelContext
   framework_versions={'transformers':pkg.get_pkg_version('transformers'),'safetensors':pkg.get_pkg_version('safetensors'),'optimum':pkg.get_pkg_version('optimum'),'accelerate':pkg.get_pkg_version('accelerate')}
   if iutils.is_torch_available():framework_versions['torch']=pkg.get_pkg_version('torch')
diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py
index 2703d9ec..05bc3d54 100644
--- a/openllm-python/src/openllm/__init__.py
+++ b/openllm-python/src/openllm/__init__.py
@@ -3,14 +3,12 @@ import os as _os
 import pathlib as _pathlib
 import warnings as _warnings
 
-import openllm_cli as _cli
 from openllm_cli import _sdk
 
 from . import utils as utils
 
 if utils.DEBUG:
   utils.set_debug_mode(True)
-  utils.set_quiet_mode(False)
   _logging.basicConfig(level=_logging.NOTSET)
 else:
   # configuration for bitsandbytes before import
@@ -47,18 +45,9 @@ __lazy = utils.LazyModule(
     'serialisation': ['ggml', 'transformers'],
     '_quantisation': ['infer_quantisation_config'],
     '_llm': ['LLM'],
-    '_generation': [
-      'StopSequenceCriteria',
-      'StopOnTokens',
-      'prepare_logits_processor',
-      'get_context_length',
-      'is_sentence_complete',
-      'is_partial_stop',
-    ],
   },
   extra_objects={
     'COMPILED': COMPILED,
-    'cli': _cli,
     'start': _sdk.start,
     'start_grpc': _sdk.start_grpc,
     'build': _sdk.build,
diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi
index 11047560..a90cb999 100644
--- a/openllm-python/src/openllm/__init__.pyi
+++ b/openllm-python/src/openllm/__init__.pyi
@@ -16,7 +16,6 @@ from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_N
 # update-config-stubs.py: import stubs stop
 # fmt: on
 
-import openllm_cli as _cli
 from openllm_cli._sdk import (
   build as build,
   import_model as import_model,
@@ -44,14 +43,6 @@ from . import (
   utils as utils,
 )
 from ._deprecated import Runner as Runner
-from ._generation import (
-  StopOnTokens as StopOnTokens,
-  StopSequenceCriteria as StopSequenceCriteria,
-  prepare_logits_processor as prepare_logits_processor,
-  is_partial_stop as is_partial_stop,
-  is_sentence_complete as is_sentence_complete,
-  get_context_length as get_context_length,
-)
 from ._llm import LLM as LLM
 from ._quantisation import infer_quantisation_config as infer_quantisation_config
 from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
@@ -60,5 +51,4 @@ from .entrypoints import mount_entrypoints as mount_entrypoints
 from .protocol import openai as openai
 from .serialisation import ggml as ggml, transformers as transformers
 
-cli = _cli
 COMPILED: bool = ...
diff --git a/openllm-python/src/openllm/__main__.py b/openllm-python/src/openllm/__main__.py
index 2babfd90..99866462 100644
--- a/openllm-python/src/openllm/__main__.py
+++ b/openllm-python/src/openllm/__main__.py
@@ -1,13 +1,2 @@
-"""CLI entrypoint for OpenLLM.
-
-Usage:
-    openllm --help
-
-To start any OpenLLM model:
-    openllm start <model_name> --options ...
-"""
-
-if __name__ == '__main__':
-  from openllm_cli.entrypoint import cli
-
-  cli()
+# fmt: off
+if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli()  # noqa
diff --git a/openllm-python/src/openllm/_deprecated.py b/openllm-python/src/openllm/_deprecated.py
index b3564568..c4612209 100644
--- a/openllm-python/src/openllm/_deprecated.py
+++ b/openllm-python/src/openllm/_deprecated.py
@@ -6,7 +6,10 @@ import warnings
 
 import openllm
 from openllm_core._typing_compat import LiteralBackend, ParamSpec
-from openllm_core.utils import first_not_none, is_vllm_available
+from openllm_core.utils import first_not_none, getenv, is_vllm_available
+
+if t.TYPE_CHECKING:
+  from ._runners import Runner as _Runner
 
 P = ParamSpec('P')
 
@@ -20,7 +23,7 @@ def Runner(
   backend: LiteralBackend | None = None,
   llm_config: openllm.LLMConfig | None = None,
   **attrs: t.Any,
-) -> openllm.LLMRunner[t.Any, t.Any]:
+) -> _Runner[t.Any, t.Any]:
   """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.
 
   > [!WARNING]
@@ -73,9 +76,9 @@ def Runner(
   attrs.update(
     {
       'model_id': model_id,
-      'quantize': os.getenv('OPENLLM_QUANTIZE', attrs.get('quantize', None)),
-      'serialisation': first_not_none(
-        attrs.get('serialisation'), os.environ.get('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']
+      'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)),
+      'serialisation': getenv(
+        'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']
       ),
     }
   )
diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py
index c3b4fa5b..c4f87c5b 100644
--- a/openllm-python/src/openllm/_generation.py
+++ b/openllm-python/src/openllm/_generation.py
@@ -1,23 +1,6 @@
 import transformers
 
 
-class StopSequenceCriteria(transformers.StoppingCriteria):
-  def __init__(self, stop_sequences, tokenizer):
-    if isinstance(stop_sequences, str):
-      stop_sequences = [stop_sequences]
-    self.stop_sequences, self.tokenizer = stop_sequences, tokenizer
-
-  def __call__(self, input_ids, scores, **kwargs):
-    return any(
-      self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences
-    )
-
-
-class StopOnTokens(transformers.StoppingCriteria):
-  def __call__(self, input_ids, scores, **kwargs):
-    return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}
-
-
 def prepare_logits_processor(config):
   generation_config = config.generation_config
   logits_processor = transformers.LogitsProcessorList()
diff --git a/openllm-python/src/openllm/_generation.pyi b/openllm-python/src/openllm/_generation.pyi
index c727f6be..845346f5 100644
--- a/openllm-python/src/openllm/_generation.pyi
+++ b/openllm-python/src/openllm/_generation.pyi
@@ -1,27 +1,7 @@
-from typing import Any, List, Union
-
-from torch import FloatTensor, LongTensor
-from transformers import (
-  LogitsProcessorList,
-  PretrainedConfig,
-  PreTrainedTokenizer,
-  PreTrainedTokenizerBase,
-  PreTrainedTokenizerFast,
-)
+from transformers import LogitsProcessorList, PretrainedConfig
 
 from openllm_core import LLMConfig
 
-Tokenizer = Union[PreTrainedTokenizerBase, PreTrainedTokenizer, PreTrainedTokenizerFast]
-
-class StopSequenceCriteria:
-  stop_sequences: List[str]
-  tokenizer: Tokenizer
-  def __init__(self, stop_sequences: Union[str, List[str]], tokenizer: Tokenizer) -> None: ...
-  def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...
-
-class StopOnTokens:
-  def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...
-
 def prepare_logits_processor(config: LLMConfig) -> LogitsProcessorList: ...
 def get_context_length(config: PretrainedConfig) -> int: ...
 def is_sentence_complete(output: str) -> bool: ...
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index b08b0e75..31b63d72 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -71,9 +71,7 @@ def normalise_model_name(name: str) -> str:
 
 def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
   if not is_peft_available():
-    raise RuntimeError(
-      "LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'"
-    )
+    raise RuntimeError("Requires 'peft' to be installed. Do 'pip install \"openllm[fine-tune]\"'")
   from huggingface_hub import hf_hub_download
 
   resolved: AdapterMap = {}
@@ -285,8 +283,6 @@ class LLM(t.Generic[M, T], ReprMixin):
     if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES
     return self.__llm_trust_remote_code__
   @property
-  def runner_name(self):return f"llm-{self.config['start_name']}-runner"
-  @property
   def model_id(self):return self._model_id
   @property
   def revision(self):return self._revision
diff --git a/openllm-python/src/openllm/_llm.pyi b/openllm-python/src/openllm/_llm.pyi
index e47a4294..af88d171 100644
--- a/openllm-python/src/openllm/_llm.pyi
+++ b/openllm-python/src/openllm/_llm.pyi
@@ -97,8 +97,6 @@ class LLM(Generic[M, T]):
   @property
   def trust_remote_code(self) -> bool: ...
   @property
-  def runner_name(self) -> str: ...
-  @property
   def model_id(self) -> str: ...
   @property
   def revision(self) -> str: ...
diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py
index a4293534..a9c72ca9 100644
--- a/openllm-python/src/openllm/_runners.py
+++ b/openllm-python/src/openllm/_runners.py
@@ -9,7 +9,6 @@ import torch
 import bentoml
 import openllm
 from openllm_core._schemas import CompletionChunk, GenerationOutput, SampleLogprobs
-from openllm_core.exceptions import OpenLLMException
 from openllm_core.utils import ReprMixin, is_ctranslate_available, is_vllm_available
 
 __all__ = ['runner']
@@ -28,12 +27,10 @@ def registry(cls=None, *, alias=None):
 
 
 def runner(llm: openllm.LLM):
-  from ._strategies import CascadingResourceStrategy
-
   try:
-    models = [llm.bentomodel]
-  except bentoml.exceptions.NotFound as err:
-    raise RuntimeError(f'Failed to locate {llm.bentomodel}:{err}') from err
+    assert llm.bentomodel
+  except (bentoml.exceptions.NotFound, AssertionError) as err:
+    raise RuntimeError(f'Failed to locate {llm.bentomodel}: {err}') from err
 
   return types.new_class(
     llm.config.__class__.__name__[:-6] + 'Runner',
@@ -73,9 +70,9 @@ def runner(llm: openllm.LLM):
     ),
   )(
     _registry[llm.__llm_backend__],
-    name=llm.runner_name,
-    models=models,
-    scheduling_strategy=CascadingResourceStrategy,
+    name=f"llm-{llm.config['start_name']}-runner",
+    models=[llm.bentomodel],
+    scheduling_strategy=openllm.CascadingResourceStrategy,
     runnable_init_params={'llm': llm},
   )
 
@@ -87,7 +84,7 @@ class CTranslateRunnable(bentoml.Runnable):
 
   def __init__(self, llm):
     if not is_ctranslate_available():
-      raise OpenLLMException('ctranslate is not installed. Please install it with `pip install "openllm[ctranslate]"`')
+      raise openllm.exceptions.OpenLLMException('ctranslate is not installed. Do `pip install "openllm[ctranslate]"`')
     self.llm, self.config, self.model, self.tokenizer = llm, llm.config, llm.model, llm.tokenizer
 
   @bentoml.Runnable.method(batchable=False)
@@ -137,7 +134,7 @@ class vLLMRunnable(bentoml.Runnable):
 
   def __init__(self, llm):
     if not is_vllm_available():
-      raise OpenLLMException('vLLM is not installed. Please install it via `pip install "openllm[vllm]"`.')
+      raise openllm.exceptions.OpenLLMException('vLLM is not installed. Do `pip install "openllm[vllm]"`.')
     import vllm
 
     self.llm, self.config, self.tokenizer = llm, llm.config, llm.tokenizer
@@ -162,7 +159,9 @@ class vLLMRunnable(bentoml.Runnable):
       )
     except Exception as err:
       traceback.print_exc()
-      raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
+      raise openllm.exceptions.OpenLLMException(
+        f'Failed to initialise vLLMEngine due to the following error:\n{err}'
+      ) from err
 
   @bentoml.Runnable.method(batchable=False)
   async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py
index 1f1d6b47..7a731a52 100644
--- a/openllm-python/src/openllm/_strategies.py
+++ b/openllm-python/src/openllm/_strategies.py
@@ -14,22 +14,13 @@ import psutil
 import bentoml
 from bentoml._internal.resource import get_resource, system_resources
 from bentoml._internal.runner.strategy import THREAD_ENVS
-from openllm_core._typing_compat import overload
 from openllm_core.utils import DEBUG, ReprMixin
 
-
-class DynResource(t.Protocol):
-  resource_id: t.ClassVar[str]
-
-  @classmethod
-  def from_system(cls) -> t.Sequence[t.Any]: ...
-
-
 logger = logging.getLogger(__name__)
 
 
 def _strtoul(s: str) -> int:
-  """Return -1 or positive integer sequence string starts with,."""
+  # Return -1 or positive integer sequence string starts with.
   if not s:
     return -1
   idx = 0
@@ -55,21 +46,6 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
   return rcs
 
 
-_STACK_LEVEL = 3
-
-
-@overload  # variant: default callback
-def _parse_visible_devices() -> list[str] | None: ...
-
-
-@overload  # variant: specify None, and respect_env
-def _parse_visible_devices(default_var: None, *, respect_env: t.Literal[True]) -> list[str] | None: ...
-
-
-@overload  # variant: default var is something other than None
-def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]: ...
-
-
 def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
   """CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
   if respect_env:
@@ -101,146 +77,136 @@ def _parse_visible_devices(default_var: str | None = None, respect_env: bool = T
   return [str(i) for i in rc]
 
 
-def _from_system(cls: type[DynResource]) -> list[str]:
-  visible_devices = _parse_visible_devices()
-  if visible_devices is None:
-    if cls.resource_id == 'amd.com/gpu':
-      if not psutil.LINUX:
-        if DEBUG:
-          logger.debug('AMD GPUs is currently only supported on Linux.')
-        return []
-      # ROCm does not currently have the rocm_smi wheel.
-      # So we need to use the ctypes bindings directly.
-      # we don't want to use CLI because parsing is a pain.
-      sys.path.append('/opt/rocm/libexec/rocm_smi')
-      try:
-        from ctypes import byref, c_uint32
-
-        # refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py
-        from rsmiBindings import rocmsmi, rsmi_status_t
-
-        device_count = c_uint32(0)
-        ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
-        if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
-          return [str(i) for i in range(device_count.value)]
-        return []
-      # In this case the binary is not found, returning empty list
-      except (ModuleNotFoundError, ImportError):
-        return []
-      finally:
-        sys.path.remove('/opt/rocm/libexec/rocm_smi')
-    else:
-      try:
-        from cuda import cuda
-
-        cuda.cuInit(0)
-        _, dev = cuda.cuDeviceGetCount()
-        return [str(i) for i in range(dev)]
-      except (ImportError, RuntimeError, AttributeError):
-        return []
-  return visible_devices
-
-
-@overload
-def _from_spec(cls: type[DynResource], spec: int) -> list[str]: ...
-
-
-@overload
-def _from_spec(cls: type[DynResource], spec: list[int | str]) -> list[str]: ...
-
-
-@overload
-def _from_spec(cls: type[DynResource], spec: str) -> list[str]: ...
-
-
-def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
-  if isinstance(spec, int):
-    if spec in (-1, 0):
-      return []
-    if spec < -1:
-      raise ValueError('Spec cannot be < -1.')
-    return [str(i) for i in range(spec)]
-  elif isinstance(spec, str):
-    if not spec:
-      return []
-    if spec.isdigit():
-      spec = ','.join([str(i) for i in range(_strtoul(spec))])
-    return _parse_visible_devices(spec, respect_env=False)
-  elif isinstance(spec, list):
-    return [str(x) for x in spec]
-  else:
-    raise TypeError(
-      f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
-    )
-
-
 def _raw_device_uuid_nvml() -> list[str] | None:
   from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer
 
   try:
     nvml_h = CDLL('libnvidia-ml.so.1')
   except Exception:
-    warnings.warn('Failed to find nvidia binding', stacklevel=_STACK_LEVEL)
+    warnings.warn('Failed to find nvidia binding', stacklevel=3)
     return None
 
   rc = nvml_h.nvmlInit()
   if rc != 0:
-    warnings.warn("Can't initialize NVML", stacklevel=_STACK_LEVEL)
+    warnings.warn("Can't initialize NVML", stacklevel=3)
     return None
   dev_count = c_int(-1)
   rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
   if rc != 0:
-    warnings.warn('Failed to get available device from system.', stacklevel=_STACK_LEVEL)
+    warnings.warn('Failed to get available device from system.', stacklevel=3)
     return None
   uuids: list[str] = []
   for idx in range(dev_count.value):
     dev_id = c_void_p()
     rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
     if rc != 0:
-      warnings.warn(f'Failed to get device handle for {idx}', stacklevel=_STACK_LEVEL)
+      warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3)
       return None
     buf_len = 96
     buf = create_string_buffer(buf_len)
     rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
     if rc != 0:
-      warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=_STACK_LEVEL)
+      warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3)
       return None
     uuids.append(buf.raw.decode('ascii').strip('\0'))
   del nvml_h
   return uuids
 
 
-def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
-  if cls.resource_id == 'amd.com/gpu':
-    raise RuntimeError(
-      "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
-    )
-  if not all(isinstance(i, str) for i in val):
-    raise ValueError('Input list should be all string type.')
+class _ResourceMixin:
+  @staticmethod
+  def from_system(cls) -> list[str]:
+    visible_devices = _parse_visible_devices()
+    if visible_devices is None:
+      if cls.resource_id == 'amd.com/gpu':
+        if not psutil.LINUX:
+          if DEBUG:
+            logger.debug('AMD GPUs is currently only supported on Linux.')
+          return []
+        # ROCm does not currently have the rocm_smi wheel.
+        # So we need to use the ctypes bindings directly.
+        # we don't want to use CLI because parsing is a pain.
+        sys.path.append('/opt/rocm/libexec/rocm_smi')
+        try:
+          from ctypes import byref, c_uint32
 
-  try:
-    from cuda import cuda
+          # refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py
+          from rsmiBindings import rocmsmi, rsmi_status_t
 
-    err, *_ = cuda.cuInit(0)
-    if err != cuda.CUresult.CUDA_SUCCESS:
-      raise RuntimeError('Failed to initialise CUDA runtime binding.')
-    # correctly parse handle
-    for el in val:
-      if el.startswith(('GPU-', 'MIG-')):
-        uuids = _raw_device_uuid_nvml()
-        if uuids is None:
-          raise ValueError('Failed to parse available GPUs UUID')
-        if el not in uuids:
-          raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
-      elif el.isdigit():
-        err, _ = cuda.cuDeviceGet(int(el))
-        if err != cuda.CUresult.CUDA_SUCCESS:
-          raise ValueError(f'Failed to get device {el}')
-  except (ImportError, RuntimeError):
-    pass
+          device_count = c_uint32(0)
+          ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
+          if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
+            return [str(i) for i in range(device_count.value)]
+          return []
+        # In this case the binary is not found, returning empty list
+        except (ModuleNotFoundError, ImportError):
+          return []
+        finally:
+          sys.path.remove('/opt/rocm/libexec/rocm_smi')
+      else:
+        try:
+          from cuda import cuda
+
+          cuda.cuInit(0)
+          _, dev = cuda.cuDeviceGetCount()
+          return [str(i) for i in range(dev)]
+        except (ImportError, RuntimeError, AttributeError):
+          return []
+    return visible_devices
+
+  @staticmethod
+  def from_spec(cls, spec) -> list[str]:
+    if isinstance(spec, int):
+      if spec in (-1, 0):
+        return []
+      if spec < -1:
+        raise ValueError('Spec cannot be < -1.')
+      return [str(i) for i in range(spec)]
+    elif isinstance(spec, str):
+      if not spec:
+        return []
+      if spec.isdigit():
+        spec = ','.join([str(i) for i in range(_strtoul(spec))])
+      return _parse_visible_devices(spec, respect_env=False)
+    elif isinstance(spec, list):
+      return [str(x) for x in spec]
+    else:
+      raise TypeError(
+        f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
+      )
+
+  @staticmethod
+  def validate(cls, val: list[t.Any]) -> None:
+    if cls.resource_id == 'amd.com/gpu':
+      raise RuntimeError(
+        "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
+      )
+    if not all(isinstance(i, str) for i in val):
+      raise ValueError('Input list should be all string type.')
+
+    try:
+      from cuda import cuda
+
+      err, *_ = cuda.cuInit(0)
+      if err != cuda.CUresult.CUDA_SUCCESS:
+        raise RuntimeError('Failed to initialise CUDA runtime binding.')
+      # correctly parse handle
+      for el in val:
+        if el.startswith(('GPU-', 'MIG-')):
+          uuids = _raw_device_uuid_nvml()
+          if uuids is None:
+            raise ValueError('Failed to parse available GPUs UUID')
+          if el not in uuids:
+            raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
+        elif el.isdigit():
+          err, _ = cuda.cuDeviceGet(int(el))
+          if err != cuda.CUresult.CUDA_SUCCESS:
+            raise ValueError(f'Failed to get device {el}')
+    except (ImportError, RuntimeError):
+      pass
 
 
-def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
+def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]:
   return types.new_class(
     name,
     (bentoml.Resource[t.List[str]], ReprMixin),
@@ -248,9 +214,9 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
     lambda ns: ns.update(
       {
         'resource_id': resource_kind,
-        'from_spec': classmethod(_from_spec),
-        'from_system': classmethod(_from_system),
-        'validate': classmethod(_validate),
+        'from_spec': classmethod(_ResourceMixin.from_spec),
+        'from_system': classmethod(_ResourceMixin.from_system),
+        'validate': classmethod(_ResourceMixin.validate),
         '__repr_keys__': property(lambda _: {'resource_id'}),
         '__doc__': inspect.cleandoc(docstring),
         '__module__': 'openllm._strategies',
@@ -259,15 +225,9 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
   )
 
 
-# NOTE: we need to hint these t.Literal since mypy is to dumb to infer this as literal 🤦
-_TPU_RESOURCE: t.Literal['cloud-tpus.google.com/v2'] = 'cloud-tpus.google.com/v2'
-_AMD_GPU_RESOURCE: t.Literal['amd.com/gpu'] = 'amd.com/gpu'
-_NVIDIA_GPU_RESOURCE: t.Literal['nvidia.com/gpu'] = 'nvidia.com/gpu'
-_CPU_RESOURCE: t.Literal['cpu'] = 'cpu'
-
 NvidiaGpuResource = _make_resource_class(
   'NvidiaGpuResource',
-  _NVIDIA_GPU_RESOURCE,
+  'nvidia.com/gpu',
   """NVIDIA GPU resource.
 
     This is a modified version of internal's BentoML's NvidiaGpuResource
@@ -275,7 +235,7 @@ NvidiaGpuResource = _make_resource_class(
 )
 AmdGpuResource = _make_resource_class(
   'AmdGpuResource',
-  _AMD_GPU_RESOURCE,
+  'amd.com/gpu',
   """AMD GPU resource.
 
     Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py
index 91a95db4..a10cf35b 100644
--- a/openllm-python/src/openllm/client.py
+++ b/openllm-python/src/openllm/client.py
@@ -1,9 +1,6 @@
+# fmt: off
 import openllm_client as _client
 
 
-def __dir__():
-  return sorted(dir(_client))
-
-
-def __getattr__(it):
-  return getattr(_client, it)
+def __dir__():return sorted(dir(_client))
+def __getattr__(it):return getattr(_client, it)
diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py
deleted file mode 100644
index f51355bf..00000000
--- a/openllm-python/src/openllm/testing.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from __future__ import annotations
-import contextlib
-import logging
-import shutil
-import subprocess
-import typing as t
-
-import bentoml
-import openllm
-
-if t.TYPE_CHECKING:
-  from openllm_core._typing_compat import LiteralBackend, LiteralQuantise
-
-logger = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def build_bento(
-  model: str, model_id: str | None = None, quantize: LiteralQuantise | None = None, cleanup: bool = False
-) -> t.Iterator[bentoml.Bento]:
-  logger.info('Building BentoML for %s', model)
-  bento = openllm.build(model, model_id=model_id, quantize=quantize)
-  yield bento
-  if cleanup:
-    logger.info('Deleting %s', bento.tag)
-    bentoml.bentos.delete(bento.tag)
-
-
-@contextlib.contextmanager
-def build_container(
-  bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any
-) -> t.Iterator[str]:
-  if isinstance(bento, bentoml.Bento):
-    bento_tag = bento.tag
-  else:
-    bento_tag = bentoml.Tag.from_taglike(bento)
-  if image_tag is None:
-    image_tag = str(bento_tag)
-  executable = shutil.which('docker')
-  if not executable:
-    raise RuntimeError('docker executable not found')
-  try:
-    logger.info('Building container for %s', bento_tag)
-    bentoml.container.build(bento_tag, backend='docker', image_tag=(image_tag,), progress='plain', **attrs)
-    yield image_tag
-  finally:
-    if cleanup:
-      logger.info('Deleting container %s', image_tag)
-      subprocess.check_output([executable, 'rmi', '-f', image_tag])
-
-
-@contextlib.contextmanager
-def prepare(
-  model: str,
-  model_id: str,
-  backend: LiteralBackend = 'pt',
-  deployment_mode: t.Literal['container', 'local'] = 'local',
-  clean_context: contextlib.ExitStack | None = None,
-  cleanup: bool = True,
-) -> t.Iterator[str]:
-  if clean_context is None:
-    clean_context = contextlib.ExitStack()
-    cleanup = True
-  llm = openllm.LLM[t.Any, t.Any](model_id, backend=backend)
-  bento_tag = bentoml.Tag.from_taglike(f'{llm.llm_type}-service:{llm.tag.version}')
-  if not bentoml.list(bento_tag):
-    bento = clean_context.enter_context(build_bento(model, model_id=model_id, cleanup=cleanup))
-  else:
-    bento = bentoml.get(bento_tag)
-  container_name = f'openllm-{model}-{llm.llm_type}'.replace('-', '_')
-  if deployment_mode == 'container':
-    container_name = clean_context.enter_context(build_container(bento, image_tag=container_name, cleanup=cleanup))
-  yield container_name
-  if cleanup:
-    clean_context.close()
diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils.py
similarity index 100%
rename from openllm-python/src/openllm/utils/__init__.py
rename to openllm-python/src/openllm/utils.py
diff --git a/openllm-python/src/openllm/utils/__init__.pyi b/openllm-python/src/openllm/utils.pyi
similarity index 98%
rename from openllm-python/src/openllm/utils/__init__.pyi
rename to openllm-python/src/openllm/utils.pyi
index ae535687..f5627aec 100644
--- a/openllm-python/src/openllm/utils/__init__.pyi
+++ b/openllm-python/src/openllm/utils.pyi
@@ -44,7 +44,6 @@ from openllm_core.utils import (
   is_transformers_available as is_transformers_available,
   is_vllm_available as is_vllm_available,
   lenient_issubclass as lenient_issubclass,
-  reserve_free_port as reserve_free_port,
   resolve_filepath as resolve_filepath,
   resolve_user_filepath as resolve_user_filepath,
   serde as serde,
diff --git a/openllm-python/src/openllm_cli/_factory.py b/openllm-python/src/openllm_cli/_factory.py
index 3ebd8374..26bf488b 100644
--- a/openllm-python/src/openllm_cli/_factory.py
+++ b/openllm-python/src/openllm_cli/_factory.py
@@ -21,7 +21,7 @@ from openllm_core._typing_compat import (
   ParamSpec,
   get_literal_args,
 )
-from openllm_core.utils import DEBUG, resolve_user_filepath
+from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath
 
 
 class _OpenLLM_GenericInternalConfig(LLMConfig):
@@ -134,7 +134,7 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
 
 def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
   def wrapper(fn: FC) -> t.Callable[[FC], FC]:
-    composed = openllm.utils.compose(
+    composed = compose(
       _OpenLLM_GenericInternalConfig.parse,
       _http_server_args if not serve_grpc else _grpc_server_args,
       cog.optgroup.group('General LLM Options', help='The following options are related to running LLM Server.'),
@@ -160,7 +160,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC
       serialisation_option(factory=cog.optgroup),
       cog.optgroup.option(
         '--device',
-        type=openllm.utils.dantic.CUDA,
+        type=dantic.CUDA,
         multiple=True,
         envvar='CUDA_VISIBLE_DEVICES',
         callback=parse_device_callback,
diff --git a/openllm-python/src/openllm_cli/_sdk.py b/openllm-python/src/openllm_cli/_sdk.py
index d868cbff..6fa1154f 100644
--- a/openllm-python/src/openllm_cli/_sdk.py
+++ b/openllm-python/src/openllm_cli/_sdk.py
@@ -294,7 +294,7 @@ def _list_models() -> dict[str, t.Any]:
   """List all available models within the local store."""
   from .entrypoint import models_command
 
-  return models_command.main(args=['--show-available', '--quiet'], standalone_mode=False)
+  return models_command.main(args=['--quiet'], standalone_mode=False)
 
 
 start, start_grpc = codegen.gen_sdk(_start, _serve_grpc=False), codegen.gen_sdk(_start, _serve_grpc=True)
diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py
index 86b31185..4f0286cc 100644
--- a/openllm-python/src/openllm_cli/entrypoint.py
+++ b/openllm-python/src/openllm_cli/entrypoint.py
@@ -1,25 +1,3 @@
-"""OpenLLM CLI interface.
-
-This module also contains the SDK to call ``start`` and ``build`` from SDK
-
-Start any LLM:
-
-```python
-openllm.start('mistral', model_id='mistralai/Mistral-7B-v0.1')
-```
-
-Build a BentoLLM
-
-```python
-bento = openllm.build('mistralai/Mistral-7B-v0.1')
-```
-
-Import any LLM into local store
-```python
-bentomodel = openllm.import_model('mistralai/Mistral-7B-v0.1')
-```
-"""
-
 from __future__ import annotations
 import enum
 import functools
@@ -91,7 +69,6 @@ from openllm_core.utils import (
 from . import termui
 from ._factory import (
   FC,
-  LiteralOutput,
   _AnyCallable,
   backend_option,
   container_registry_option,
@@ -1225,7 +1202,11 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
 @model_name_argument(required=False)
 @click.option('-y', '--yes', '--assume-yes', is_flag=True, help='Skip confirmation when deleting a specific model')
 @click.option(
-  '--include-bentos/--no-include-bentos', is_flag=True, default=False, help='Whether to also include pruning bentos.'
+  '--include-bentos/--no-include-bentos',
+  is_flag=True,
+  hidden=True,
+  default=True,
+  help='Whether to also include pruning bentos.',
 )
 @inject
 @click.pass_context
@@ -1233,11 +1214,11 @@ def prune_command(
   ctx: click.Context,
   model_name: str | None,
   yes: bool,
-  include_bentos: bool,
   model_store: ModelStore = Provide[BentoMLContainer.model_store],
   bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
+  **_: t.Any,
 ) -> None:
-  """Remove all saved models, (and optionally bentos) built with OpenLLM locally.
+  """Remove all saved models, and bentos built with OpenLLM locally.
 
   \b
   If a model type is passed, then only prune models for that given model type.
@@ -1252,18 +1233,15 @@ def prune_command(
       (m, store)
       for m, store in available
       if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name)
+    ] + [
+      (b, bento_store)
+      for b in bentoml.bentos.list()
+      if 'start_name' in b.info.labels and b.info.labels['start_name'] == inflection.underscore(model_name)
+    ]
+  if model_name is None:
+    available += [
+      (b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels
     ]
-  if include_bentos:
-    if model_name is not None:
-      available += [
-        (b, bento_store)
-        for b in bentoml.bentos.list()
-        if 'start_name' in b.info.labels and b.info.labels['start_name'] == inflection.underscore(model_name)
-      ]
-    else:
-      available += [
-        (b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels
-      ]
 
   for store_item, store in available:
     if yes:
@@ -1316,69 +1294,6 @@ def shared_client_options(f: _AnyCallable | None = None) -> t.Callable[[FC], FC]
   return compose(*options)(f) if f is not None else compose(*options)
 
 
-@cli.command(hidden=True)
-@click.argument('task', type=click.STRING, metavar='TASK')
-@shared_client_options
-@click.option(
-  '--agent',
-  type=click.Choice(['hf']),
-  default='hf',
-  help='Whether to interact with Agents from given Server endpoint.',
-  show_default=True,
-)
-@click.option(
-  '--remote',
-  is_flag=True,
-  default=False,
-  help='Whether or not to use remote tools (inference endpoints) instead of local ones.',
-  show_default=True,
-)
-@click.option(
-  '--opt',
-  help="Define prompt options. (format: ``--opt text='I love this' --opt audio:./path/to/audio  --opt image:/path/to/file``)",
-  required=False,
-  multiple=True,
-  callback=opt_callback,
-  metavar='ARG=VALUE[,ARG=VALUE]',
-)
-def instruct_command(
-  endpoint: str,
-  timeout: int,
-  agent: LiteralString,
-  output: LiteralOutput,
-  remote: bool,
-  task: str,
-  _memoized: DictStrAny,
-  **attrs: t.Any,
-) -> str:
-  """Instruct agents interactively for given tasks, from a terminal.
-
-  \b
-  ```bash
-  $ openllm instruct --endpoint http://12.323.2.1:3000 \\
-        "Is the following `text` (in Spanish) positive or negative?" \\
-        --text "¡Este es un API muy agradable!"
-  ```
-  """
-  raise click.ClickException("'instruct' is currently disabled")
-  # client = openllm.client.HTTPClient(endpoint, timeout=timeout)
-  #
-  # try:
-  #   client.call('metadata')
-  # except http.client.BadStatusLine:
-  #   raise click.ClickException(f'{endpoint} is neither a HTTP server nor reachable.') from None
-  # if agent == 'hf':
-  #   _memoized = {k: v[0] for k, v in _memoized.items() if v}
-  #   client._hf_agent.set_stream(logger.info)
-  #   if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta')
-  #   result = client.ask_agent(task, agent_type=agent, return_code=False, remote=remote, **_memoized)
-  #   if output == 'json': termui.echo(orjson.dumps(result, option=orjson.OPT_INDENT_2).decode(), fg='white')
-  #   else: termui.echo(result, fg='white')
-  #   return result
-  # else:
-  #   raise click.BadOptionUsage('agent', f'Unknown agent type {agent}')
-
-
 @cli.command()
 @shared_client_options
 @click.option(
diff --git a/openllm-python/tests/models/__init__.py b/openllm-python/tests/models/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json b/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json
deleted file mode 100644
index 38506cbd..00000000
--- a/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[container].json
+++ /dev/null
@@ -1,33 +0,0 @@
-{
-  "configuration": {
-    "generation_config": {
-      "diversity_penalty": 0.0,
-      "early_stopping": false,
-      "encoder_no_repeat_ngram_size": 0,
-      "encoder_repetition_penalty": 1.0,
-      "epsilon_cutoff": 0.0,
-      "eta_cutoff": 0.0,
-      "length_penalty": 1.0,
-      "max_new_tokens": 10,
-      "min_length": 0,
-      "no_repeat_ngram_size": 0,
-      "num_beam_groups": 1,
-      "num_beams": 1,
-      "num_return_sequences": 1,
-      "output_attentions": false,
-      "output_hidden_states": false,
-      "output_scores": false,
-      "remove_invalid_values": false,
-      "renormalize_logits": false,
-      "repetition_penalty": 1.0,
-      "temperature": 0.9,
-      "top_k": 50,
-      "top_p": 0.9,
-      "typical_p": 1.0,
-      "use_cache": true
-    }
-  },
-  "responses": [
-    "life is a complete physical life"
-  ]
-}
\ No newline at end of file
diff --git a/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json b/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json
deleted file mode 100644
index 6f1deb95..00000000
--- a/openllm-python/tests/models/__snapshots__/flan_t5_test/test_flan_t5[local].json
+++ /dev/null
@@ -1,33 +0,0 @@
-{
-  "configuration": {
-    "generation_config": {
-      "diversity_penalty": 0.0,
-      "early_stopping": false,
-      "encoder_no_repeat_ngram_size": 0,
-      "encoder_repetition_penalty": 1.0,
-      "epsilon_cutoff": 0.0,
-      "eta_cutoff": 0.0,
-      "length_penalty": 1.0,
-      "max_new_tokens": 10,
-      "min_length": 0,
-      "no_repeat_ngram_size": 0,
-      "num_beam_groups": 1,
-      "num_beams": 1,
-      "num_return_sequences": 1,
-      "output_attentions": false,
-      "output_hidden_states": false,
-      "output_scores": false,
-      "remove_invalid_values": false,
-      "renormalize_logits": false,
-      "repetition_penalty": 1.0,
-      "temperature": 0.9,
-      "top_k": 50,
-      "top_p": 0.9,
-      "typical_p": 1.0,
-      "use_cache": true
-    }
-  },
-  "responses": [
-    "life is a state"
-  ]
-}
\ No newline at end of file
diff --git a/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json b/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json
deleted file mode 100644
index 0727c509..00000000
--- a/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[container].json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "configuration": {
-    "format_outputs": false,
-    "generation_config": {
-      "diversity_penalty": 0.0,
-      "early_stopping": false,
-      "encoder_no_repeat_ngram_size": 0,
-      "encoder_repetition_penalty": 1.0,
-      "epsilon_cutoff": 0.0,
-      "eta_cutoff": 0.0,
-      "length_penalty": 1.0,
-      "max_new_tokens": 20,
-      "min_length": 0,
-      "no_repeat_ngram_size": 0,
-      "num_beam_groups": 1,
-      "num_beams": 1,
-      "num_return_sequences": 1,
-      "output_attentions": false,
-      "output_hidden_states": false,
-      "output_scores": false,
-      "remove_invalid_values": false,
-      "renormalize_logits": false,
-      "repetition_penalty": 1.0,
-      "temperature": 0.75,
-      "top_k": 15,
-      "top_p": 1.0,
-      "typical_p": 1.0,
-      "use_cache": true
-    }
-  },
-  "responses": [
-    "What is Deep learning?\nDeep learning is a new way of studying the content and making an informed decision. It is the"
-  ]
-}
\ No newline at end of file
diff --git a/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json b/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json
deleted file mode 100644
index b17a783d..00000000
--- a/openllm-python/tests/models/__snapshots__/opt_test/test_opt_125m[local].json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "configuration": {
-    "format_outputs": false,
-    "generation_config": {
-      "diversity_penalty": 0.0,
-      "early_stopping": false,
-      "encoder_no_repeat_ngram_size": 0,
-      "encoder_repetition_penalty": 1.0,
-      "epsilon_cutoff": 0.0,
-      "eta_cutoff": 0.0,
-      "length_penalty": 1.0,
-      "max_new_tokens": 20,
-      "min_length": 0,
-      "no_repeat_ngram_size": 0,
-      "num_beam_groups": 1,
-      "num_beams": 1,
-      "num_return_sequences": 1,
-      "output_attentions": false,
-      "output_hidden_states": false,
-      "output_scores": false,
-      "remove_invalid_values": false,
-      "renormalize_logits": false,
-      "repetition_penalty": 1.0,
-      "temperature": 0.75,
-      "top_k": 15,
-      "top_p": 1.0,
-      "typical_p": 1.0,
-      "use_cache": true
-    }
-  },
-  "responses": [
-    "What is Deep learning?\n\nDeep learning is a new, highly-advanced, and powerful tool for the deep learning"
-  ]
-}
\ No newline at end of file
diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py
deleted file mode 100644
index 205dca20..00000000
--- a/openllm-python/tests/models/conftest.py
+++ /dev/null
@@ -1,266 +0,0 @@
-from __future__ import annotations
-import asyncio
-import contextlib
-import functools
-import logging
-import sys
-import time
-import typing as t
-from abc import ABC, abstractmethod
-
-import attr
-import docker
-import docker.errors
-import docker.types
-import orjson
-import pytest
-from syrupy.extensions.json import JSONSnapshotExtension
-
-import openllm
-from bentoml._internal.types import LazyType
-from openllm._llm import self
-from openllm_core._typing_compat import DictStrAny, ListAny, LiteralQuantise
-
-logger = logging.getLogger(__name__)
-
-if t.TYPE_CHECKING:
-  import subprocess
-
-  from syrupy.assertion import SnapshotAssertion
-  from syrupy.types import PropertyFilter, PropertyMatcher, SerializableData, SerializedData
-
-  from openllm.client import BaseAsyncClient
-
-
-class ResponseComparator(JSONSnapshotExtension):
-  def serialize(
-    self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None
-  ) -> SerializedData:
-    if LazyType(ListAny).isinstance(data):
-      data = [d.unmarshaled for d in data]
-    else:
-      data = data.unmarshaled
-    data = self._filter(data=data, depth=0, path=(), exclude=exclude, matcher=matcher)
-    return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode()
-
-  def matches(self, *, serialized_data: SerializableData, snapshot_data: SerializableData) -> bool:
-    def convert_data(data: SerializableData) -> openllm.GenerationOutput | t.Sequence[openllm.GenerationOutput]:
-      try:
-        data = orjson.loads(data)
-      except orjson.JSONDecodeError as err:
-        raise ValueError(f'Failed to decode JSON data: {data}') from err
-      if LazyType(DictStrAny).isinstance(data):
-        return openllm.GenerationOutput(**data)
-      elif LazyType(ListAny).isinstance(data):
-        return [openllm.GenerationOutput(**d) for d in data]
-      else:
-        raise NotImplementedError(f'Data {data} has unsupported type.')
-
-    serialized_data = convert_data(serialized_data)
-    snapshot_data = convert_data(snapshot_data)
-
-    if LazyType(ListAny).isinstance(serialized_data):
-      serialized_data = [serialized_data]
-    if LazyType(ListAny).isinstance(snapshot_data):
-      snapshot_data = [snapshot_data]
-
-    def eq_output(s: openllm.GenerationOutput, t: openllm.GenerationOutput) -> bool:
-      return len(s.outputs) == len(t.outputs)
-
-    return len(serialized_data) == len(snapshot_data) and all(
-      [eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)]
-    )
-
-
-@pytest.fixture()
-def response_snapshot(snapshot: SnapshotAssertion):
-  return snapshot.use_extension(ResponseComparator)
-
-
-@attr.define(init=False)
-class _Handle(ABC):
-  port: int
-  deployment_mode: t.Literal['container', 'local']
-
-  client: BaseAsyncClient[t.Any] = attr.field(init=False)
-
-  if t.TYPE_CHECKING:
-
-    def __attrs_init__(self, *args: t.Any, **attrs: t.Any): ...
-
-  def __attrs_post_init__(self):
-    self.client = openllm.client.AsyncHTTPClient(f'http://localhost:{self.port}')
-
-  @abstractmethod
-  def status(self) -> bool:
-    raise NotImplementedError
-
-  async def health(self, timeout: int = 240):
-    start_time = time.time()
-    while time.time() - start_time < timeout:
-      if not self.status():
-        raise RuntimeError(f'Failed to initialise {self.__class__.__name__}')
-      await self.client.health()
-      try:
-        await self.client.query('sanity')
-        return
-      except Exception:
-        time.sleep(1)
-    raise RuntimeError(f'Handle failed to initialise within {timeout} seconds.')
-
-
-@attr.define(init=False)
-class LocalHandle(_Handle):
-  process: subprocess.Popen[bytes]
-
-  def __init__(self, process: subprocess.Popen[bytes], port: int, deployment_mode: t.Literal['container', 'local']):
-    self.__attrs_init__(port, deployment_mode, process)
-
-  def status(self) -> bool:
-    return self.process.poll() is None
-
-
-class HandleProtocol(t.Protocol):
-  @contextlib.contextmanager
-  def __call__(
-    *, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None
-  ) -> t.Generator[_Handle, None, None]: ...
-
-
-@attr.define(init=False)
-class DockerHandle(_Handle):
-  container_name: str
-  docker_client: docker.DockerClient
-
-  def __init__(
-    self,
-    docker_client: docker.DockerClient,
-    container_name: str,
-    port: int,
-    deployment_mode: t.Literal['container', 'local'],
-  ):
-    self.__attrs_init__(port, deployment_mode, container_name, docker_client)
-
-  def status(self) -> bool:
-    container = self.docker_client.containers.get(self.container_name)
-    return container.status in ['running', 'created']
-
-
-@contextlib.contextmanager
-def _local_handle(
-  model: str,
-  model_id: str,
-  image_tag: str,
-  deployment_mode: t.Literal['container', 'local'],
-  quantize: LiteralQuantise | None = None,
-  *,
-  _serve_grpc: bool = False,
-):
-  with openllm.utils.reserve_free_port() as port:
-    pass
-
-  if not _serve_grpc:
-    proc = openllm.start(
-      model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True
-    )
-  else:
-    proc = openllm.start_grpc(
-      model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True
-    )
-
-  yield LocalHandle(proc, port, deployment_mode)
-  proc.terminate()
-  proc.wait(60)
-
-  process_output = proc.stdout.read()
-  print(process_output, file=sys.stderr)
-
-  proc.stdout.close()
-  if proc.stderr:
-    proc.stderr.close()
-
-
-@contextlib.contextmanager
-def _container_handle(
-  model: str,
-  model_id: str,
-  image_tag: str,
-  deployment_mode: t.Literal['container', 'local'],
-  quantize: LiteralQuantise | None = None,
-  *,
-  _serve_grpc: bool = False,
-):
-  with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port:
-    pass
-  container_name = f'openllm-{model}-{self(model_id)}'.replace('-', '_')
-  client = docker.from_env()
-  try:
-    container = client.containers.get(container_name)
-    container.stop()
-    container.wait()
-    container.remove()
-  except docker.errors.NotFound:
-    pass
-
-  args = ['serve' if not _serve_grpc else 'serve-grpc']
-
-  env: DictStrAny = {}
-
-  if quantize is not None:
-    env['OPENLLM_QUANTIZE'] = quantize
-
-  gpus = openllm.utils.device_count() or -1
-  devs = [docker.types.DeviceRequest(count=gpus, capabilities=[['gpu']])] if gpus > 0 else None
-
-  container = client.containers.run(
-    image_tag,
-    command=args,
-    name=container_name,
-    environment=env,
-    auto_remove=False,
-    detach=True,
-    device_requests=devs,
-    ports={'3000/tcp': port, '3001/tcp': prom_port},
-  )
-
-  yield DockerHandle(client, container.name, port, deployment_mode)
-
-  try:
-    container.stop()
-    container.wait()
-  except docker.errors.NotFound:
-    pass
-
-  container_output = container.logs().decode('utf-8')
-  print(container_output, file=sys.stderr)
-
-  container.remove()
-
-
-@pytest.fixture(scope='session', autouse=True)
-def clean_context() -> t.Generator[contextlib.ExitStack, None, None]:
-  stack = contextlib.ExitStack()
-  yield stack
-  stack.close()
-
-
-@pytest.fixture(scope='module')
-def el() -> t.Generator[asyncio.AbstractEventLoop, None, None]:
-  loop = asyncio.get_event_loop()
-  yield loop
-  loop.close()
-
-
-@pytest.fixture(params=['container', 'local'], scope='session')
-def deployment_mode(request: pytest.FixtureRequest) -> str:
-  return request.param
-
-
-@pytest.fixture(scope='module')
-def handler(el: asyncio.AbstractEventLoop, deployment_mode: t.Literal['container', 'local']):
-  if deployment_mode == 'container':
-    return functools.partial(_container_handle, deployment_mode=deployment_mode)
-  elif deployment_mode == 'local':
-    return functools.partial(_local_handle, deployment_mode=deployment_mode)
-  else:
-    raise ValueError(f'Unknown deployment mode: {deployment_mode}')
diff --git a/openllm-python/tests/models/flan_t5_test.py b/openllm-python/tests/models/flan_t5_test.py
deleted file mode 100644
index 2b962703..00000000
--- a/openllm-python/tests/models/flan_t5_test.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from __future__ import annotations
-import typing as t
-
-import pytest
-
-import openllm
-
-if t.TYPE_CHECKING:
-  import contextlib
-
-  from .conftest import HandleProtocol, ResponseComparator, _Handle
-
-model = 'flan_t5'
-model_id = 'google/flan-t5-small'
-
-
-@pytest.fixture(scope='module')
-def flan_t5_handle(
-  handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack
-):
-  with openllm.testing.prepare(
-    model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context
-  ) as image_tag:
-    with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
-      yield handle
-
-
-@pytest.fixture(scope='module')
-async def flan_t5(flan_t5_handle: _Handle):
-  await flan_t5_handle.health(240)
-  return flan_t5_handle.client
-
-
-@pytest.mark.asyncio()
-async def test_flan_t5(flan_t5: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator):
-  client = await flan_t5
-  response = await client.query('What is the meaning of life?', max_new_tokens=10, top_p=0.9, return_response='attrs')
-
-  assert response.configuration['generation_config']['max_new_tokens'] == 10
-  assert response == response_snapshot
diff --git a/openllm-python/tests/models/opt_test.py b/openllm-python/tests/models/opt_test.py
deleted file mode 100644
index 3ab5befa..00000000
--- a/openllm-python/tests/models/opt_test.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from __future__ import annotations
-import typing as t
-
-import pytest
-
-import openllm
-
-if t.TYPE_CHECKING:
-  import contextlib
-
-  from .conftest import HandleProtocol, ResponseComparator, _Handle
-
-model = 'opt'
-model_id = 'facebook/opt-125m'
-
-
-@pytest.fixture(scope='module')
-def opt_125m_handle(
-  handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack
-):
-  with openllm.testing.prepare(
-    model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context
-  ) as image_tag:
-    with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
-      yield handle
-
-
-@pytest.fixture(scope='module')
-async def opt_125m(opt_125m_handle: _Handle):
-  await opt_125m_handle.health(240)
-  return opt_125m_handle.client
-
-
-@pytest.mark.asyncio()
-async def test_opt_125m(opt_125m: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator):
-  client = await opt_125m
-  response = await client.query('What is Deep learning?', max_new_tokens=20, return_response='attrs')
-
-  assert response.configuration['generation_config']['max_new_tokens'] == 20
-  assert response == response_snapshot