refactor: delete unused code (#716)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron Pham committed by GitHub on 2023-11-21 04:39:48 -05:00
parent a8a9f154ce
commit c33b071ee4
28 changed files with 149 additions and 957 deletions

View File

@@ -3,14 +3,12 @@ import os as _os
import pathlib as _pathlib
import warnings as _warnings
import openllm_cli as _cli
from openllm_cli import _sdk
from . import utils as utils
if utils.DEBUG:
utils.set_debug_mode(True)
utils.set_quiet_mode(False)
_logging.basicConfig(level=_logging.NOTSET)
else:
# configuration for bitsandbytes before import
@@ -47,18 +45,9 @@ __lazy = utils.LazyModule(
'serialisation': ['ggml', 'transformers'],
'_quantisation': ['infer_quantisation_config'],
'_llm': ['LLM'],
'_generation': [
'StopSequenceCriteria',
'StopOnTokens',
'prepare_logits_processor',
'get_context_length',
'is_sentence_complete',
'is_partial_stop',
],
},
extra_objects={
'COMPILED': COMPILED,
'cli': _cli,
'start': _sdk.start,
'start_grpc': _sdk.start_grpc,
'build': _sdk.build,
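
The LazyModule call above maps public attribute names to the submodules that define them, plus a few eagerly-available extra_objects. As a rough sketch (not openllm's actual LazyModule, and meant to live in a package __init__.py), the same idea can be expressed with a PEP 562 module-level __getattr__:

import importlib

_import_structure = {'serialisation': ['ggml', 'transformers'], '_llm': ['LLM']}
_extra_objects = {'COMPILED': False}
# invert the mapping so each public name points at the submodule that defines it
_attr_to_module = {attr: mod for mod, attrs in _import_structure.items() for attr in attrs}

def __getattr__(name):  # called only for attributes not already on the module
    if name in _extra_objects:
        return _extra_objects[name]
    if name in _attr_to_module:
        submodule = importlib.import_module('.' + _attr_to_module[name], __package__)
        return getattr(submodule, name)
    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')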

View File

@@ -16,7 +16,6 @@ from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_N
# update-config-stubs.py: import stubs stop
# fmt: on
import openllm_cli as _cli
from openllm_cli._sdk import (
build as build,
import_model as import_model,
@@ -44,14 +43,6 @@ from . import (
utils as utils,
)
from ._deprecated import Runner as Runner
from ._generation import (
StopOnTokens as StopOnTokens,
StopSequenceCriteria as StopSequenceCriteria,
prepare_logits_processor as prepare_logits_processor,
is_partial_stop as is_partial_stop,
is_sentence_complete as is_sentence_complete,
get_context_length as get_context_length,
)
from ._llm import LLM as LLM
from ._quantisation import infer_quantisation_config as infer_quantisation_config
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
@@ -60,5 +51,4 @@ from .entrypoints import mount_entrypoints as mount_entrypoints
from .protocol import openai as openai
from .serialisation import ggml as ggml, transformers as transformers
cli = _cli
COMPILED: bool = ...

View File

@@ -1,13 +1,2 @@
"""CLI entrypoint for OpenLLM.
Usage:
openllm --help
To start any OpenLLM model:
openllm start <model_name> --options ...
"""
if __name__ == '__main__':
from openllm_cli.entrypoint import cli
cli()
# fmt: off
if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli() # noqa

View File

@@ -6,7 +6,10 @@ import warnings
import openllm
from openllm_core._typing_compat import LiteralBackend, ParamSpec
from openllm_core.utils import first_not_none, is_vllm_available
from openllm_core.utils import first_not_none, getenv, is_vllm_available
if t.TYPE_CHECKING:
from ._runners import Runner as _Runner
P = ParamSpec('P')
@@ -20,7 +23,7 @@ def Runner(
backend: LiteralBackend | None = None,
llm_config: openllm.LLMConfig | None = None,
**attrs: t.Any,
) -> openllm.LLMRunner[t.Any, t.Any]:
) -> _Runner[t.Any, t.Any]:
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.
> [!WARNING]
@@ -73,9 +76,9 @@ def Runner(
attrs.update(
{
'model_id': model_id,
'quantize': os.getenv('OPENLLM_QUANTIZE', attrs.get('quantize', None)),
'serialisation': first_not_none(
attrs.get('serialisation'), os.environ.get('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']
'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)),
'serialisation': getenv(
'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']
),
}
)
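
The replacement getenv call accepts alternate spellings (QUANTIZE/QUANTISE, serialization/SERIALISATION). A hedged sketch of what such a helper might look like, assuming it prefixes names with OPENLLM_ the way the old os.getenv('OPENLLM_QUANTIZE', ...) call did:

import os
import typing as t

def getenv(name: str, default: t.Any = None, var: t.Sequence[str] = ()) -> t.Any:
    # try OPENLLM_QUANTIZE first, then each alias such as OPENLLM_QUANTISE
    for candidate in (name, *var):
        value = os.environ.get(f'OPENLLM_{candidate.upper()}')
        if value is not None:
            return value
    return default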

View File

@@ -1,23 +1,6 @@
import transformers
class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(self, stop_sequences, tokenizer):
if isinstance(stop_sequences, str):
stop_sequences = [stop_sequences]
self.stop_sequences, self.tokenizer = stop_sequences, tokenizer
def __call__(self, input_ids, scores, **kwargs):
return any(
self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences
)
class StopOnTokens(transformers.StoppingCriteria):
def __call__(self, input_ids, scores, **kwargs):
return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}
def prepare_logits_processor(config):
generation_config = config.generation_config
logits_processor = transformers.LogitsProcessorList()
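
The deleted StopSequenceCriteria and StopOnTokens classes are transformers.StoppingCriteria subclasses; for reference, criteria like these are consumed by generate() through a StoppingCriteriaList. An illustrative usage (model name is a placeholder, and StopSequenceCriteria refers to the class shown in the removed file above):

from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList

tokenizer = AutoTokenizer.from_pretrained('gpt2')            # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained('gpt2')
inputs = tokenizer('The answer is', return_tensors='pt')
# stop as soon as the decoded text ends with '###'
criteria = StoppingCriteriaList([StopSequenceCriteria('###', tokenizer)])
outputs = model.generate(**inputs, stopping_criteria=criteria, max_new_tokens=32)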

View File

@@ -1,27 +1,7 @@
from typing import Any, List, Union
from torch import FloatTensor, LongTensor
from transformers import (
LogitsProcessorList,
PretrainedConfig,
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)
from transformers import LogitsProcessorList, PretrainedConfig
from openllm_core import LLMConfig
Tokenizer = Union[PreTrainedTokenizerBase, PreTrainedTokenizer, PreTrainedTokenizerFast]
class StopSequenceCriteria:
stop_sequences: List[str]
tokenizer: Tokenizer
def __init__(self, stop_sequences: Union[str, List[str]], tokenizer: Tokenizer) -> None: ...
def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...
class StopOnTokens:
def __call__(self, input_ids: LongTensor, scores: FloatTensor, **kwargs: Any) -> bool: ...
def prepare_logits_processor(config: LLMConfig) -> LogitsProcessorList: ...
def get_context_length(config: PretrainedConfig) -> int: ...
def is_sentence_complete(output: str) -> bool: ...
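
Of the removed stubs, get_context_length has the least obvious behaviour. A rough sketch of how such a helper is commonly implemented (assumed, not the exact deleted code) is to probe the usual PretrainedConfig attribute names for the context window:

from transformers import PretrainedConfig

def get_context_length(config: PretrainedConfig, default: int = 2048) -> int:
    # different architectures expose the context window under different names
    for key in ('max_sequence_length', 'seq_length', 'max_position_embeddings', 'n_positions'):
        value = getattr(config, key, None)
        if isinstance(value, int) and value > 0:
            return value
    return default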

View File

@@ -71,9 +71,7 @@ def normalise_model_name(name: str) -> str:
def _resolve_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
if not is_peft_available():
raise RuntimeError(
"LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'"
)
raise RuntimeError("Requires 'peft' to be installed. Do 'pip install \"openllm[fine-tune]\"'")
from huggingface_hub import hf_hub_download
resolved: AdapterMap = {}
@@ -285,8 +283,6 @@ class LLM(t.Generic[M, T], ReprMixin):
if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES
return self.__llm_trust_remote_code__
@property
def runner_name(self):return f"llm-{self.config['start_name']}-runner"
@property
def model_id(self):return self._model_id
@property
def revision(self):return self._revision
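
The trust_remote_code check above uses the usual truthy-env convention; illustratively (the exact ENV_VARS_TRUE_VALUES set lives in openllm_core.utils, the values here are assumed):

ENV_VARS_TRUE_VALUES = {'1', 'ON', 'YES', 'TRUE'}  # assumed contents

def env_flag(value) -> bool:
    # mirrors `str(env).upper() in ENV_VARS_TRUE_VALUES` from the property above
    return value is not None and str(value).upper() in ENV_VARS_TRUE_VALUES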

View File

@@ -97,8 +97,6 @@ class LLM(Generic[M, T]):
@property
def trust_remote_code(self) -> bool: ...
@property
def runner_name(self) -> str: ...
@property
def model_id(self) -> str: ...
@property
def revision(self) -> str: ...

View File

@@ -9,7 +9,6 @@ import torch
import bentoml
import openllm
from openllm_core._schemas import CompletionChunk, GenerationOutput, SampleLogprobs
from openllm_core.exceptions import OpenLLMException
from openllm_core.utils import ReprMixin, is_ctranslate_available, is_vllm_available
__all__ = ['runner']
@@ -28,12 +27,10 @@ def registry(cls=None, *, alias=None):
def runner(llm: openllm.LLM):
from ._strategies import CascadingResourceStrategy
try:
models = [llm.bentomodel]
except bentoml.exceptions.NotFound as err:
raise RuntimeError(f'Failed to locate {llm.bentomodel}:{err}') from err
assert llm.bentomodel
except (bentoml.exceptions.NotFound, AssertionError) as err:
raise RuntimeError(f'Failed to locate {llm.bentomodel}: {err}') from err
return types.new_class(
llm.config.__class__.__name__[:-6] + 'Runner',
@@ -73,9 +70,9 @@ def runner(llm: openllm.LLM):
),
)(
_registry[llm.__llm_backend__],
name=llm.runner_name,
models=models,
scheduling_strategy=CascadingResourceStrategy,
name=f"llm-{llm.config['start_name']}-runner",
models=[llm.bentomodel],
scheduling_strategy=openllm.CascadingResourceStrategy,
runnable_init_params={'llm': llm},
)
@@ -87,7 +84,7 @@ class CTranslateRunnable(bentoml.Runnable):
def __init__(self, llm):
if not is_ctranslate_available():
raise OpenLLMException('ctranslate is not installed. Please install it with `pip install "openllm[ctranslate]"`')
raise openllm.exceptions.OpenLLMException('ctranslate is not installed. Do `pip install "openllm[ctranslate]"`')
self.llm, self.config, self.model, self.tokenizer = llm, llm.config, llm.model, llm.tokenizer
@bentoml.Runnable.method(batchable=False)
@@ -137,7 +134,7 @@ class vLLMRunnable(bentoml.Runnable):
def __init__(self, llm):
if not is_vllm_available():
raise OpenLLMException('vLLM is not installed. Please install it via `pip install "openllm[vllm]"`.')
raise openllm.exceptions.OpenLLMException('vLLM is not installed. Do `pip install "openllm[vllm]"`.')
import vllm
self.llm, self.config, self.tokenizer = llm, llm.config, llm.tokenizer
@@ -162,7 +159,9 @@ class vLLMRunnable(bentoml.Runnable):
)
except Exception as err:
traceback.print_exc()
raise OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
raise openllm.exceptions.OpenLLMException(
f'Failed to initialise vLLMEngine due to the following error:\n{err}'
) from err
@bentoml.Runnable.method(batchable=False)
async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
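
The runner() factory above builds the Runner class at runtime with types.new_class. A minimal, bentoml-free sketch of that pattern (BaseRunner and the attribute names are illustrative):

import types

class BaseRunner:
    def __init__(self, name, init_params):
        self.name, self.init_params = name, init_params

def make_runner_class(config_class_name: str) -> type:
    # 'OPTConfig' -> 'OPTRunner', mirroring llm.config.__class__.__name__[:-6] + 'Runner'
    return types.new_class(
        config_class_name[:-6] + 'Runner',
        (BaseRunner,),
        exec_body=lambda ns: ns.update({'backend': 'vllm'}),
    )

runner = make_runner_class('OPTConfig')('llm-opt-runner', {'llm': None})
assert type(runner).__name__ == 'OPTRunner' and runner.backend == 'vllm'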

View File

@@ -14,22 +14,13 @@ import psutil
import bentoml
from bentoml._internal.resource import get_resource, system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS
from openllm_core._typing_compat import overload
from openllm_core.utils import DEBUG, ReprMixin
class DynResource(t.Protocol):
resource_id: t.ClassVar[str]
@classmethod
def from_system(cls) -> t.Sequence[t.Any]: ...
logger = logging.getLogger(__name__)
def _strtoul(s: str) -> int:
"""Return -1 or positive integer sequence string starts with,."""
# Return -1 or positive integer sequence string starts with.
if not s:
return -1
idx = 0
@@ -55,21 +46,6 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
return rcs
_STACK_LEVEL = 3
@overload # variant: default callback
def _parse_visible_devices() -> list[str] | None: ...
@overload # variant: specify None, and respect_env
def _parse_visible_devices(default_var: None, *, respect_env: t.Literal[True]) -> list[str] | None: ...
@overload # variant: default var is something other than None
def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]: ...
def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
"""CUDA_VISIBLE_DEVICES aware with default var for parsing spec."""
if respect_env:
@@ -101,146 +77,136 @@ def _parse_visible_devices(default_var: str | None = None, respect_env: bool = T
return [str(i) for i in rc]
def _from_system(cls: type[DynResource]) -> list[str]:
visible_devices = _parse_visible_devices()
if visible_devices is None:
if cls.resource_id == 'amd.com/gpu':
if not psutil.LINUX:
if DEBUG:
logger.debug('AMD GPUs is currently only supported on Linux.')
return []
# ROCm does not currently have the rocm_smi wheel.
# So we need to use the ctypes bindings directly.
# we don't want to use CLI because parsing is a pain.
sys.path.append('/opt/rocm/libexec/rocm_smi')
try:
from ctypes import byref, c_uint32
# refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py
from rsmiBindings import rocmsmi, rsmi_status_t
device_count = c_uint32(0)
ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
return [str(i) for i in range(device_count.value)]
return []
# In this case the binary is not found, returning empty list
except (ModuleNotFoundError, ImportError):
return []
finally:
sys.path.remove('/opt/rocm/libexec/rocm_smi')
else:
try:
from cuda import cuda
cuda.cuInit(0)
_, dev = cuda.cuDeviceGetCount()
return [str(i) for i in range(dev)]
except (ImportError, RuntimeError, AttributeError):
return []
return visible_devices
@overload
def _from_spec(cls: type[DynResource], spec: int) -> list[str]: ...
@overload
def _from_spec(cls: type[DynResource], spec: list[int | str]) -> list[str]: ...
@overload
def _from_spec(cls: type[DynResource], spec: str) -> list[str]: ...
def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
if isinstance(spec, int):
if spec in (-1, 0):
return []
if spec < -1:
raise ValueError('Spec cannot be < -1.')
return [str(i) for i in range(spec)]
elif isinstance(spec, str):
if not spec:
return []
if spec.isdigit():
spec = ','.join([str(i) for i in range(_strtoul(spec))])
return _parse_visible_devices(spec, respect_env=False)
elif isinstance(spec, list):
return [str(x) for x in spec]
else:
raise TypeError(
f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
)
def _raw_device_uuid_nvml() -> list[str] | None:
from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer
try:
nvml_h = CDLL('libnvidia-ml.so.1')
except Exception:
warnings.warn('Failed to find nvidia binding', stacklevel=_STACK_LEVEL)
warnings.warn('Failed to find nvidia binding', stacklevel=3)
return None
rc = nvml_h.nvmlInit()
if rc != 0:
warnings.warn("Can't initialize NVML", stacklevel=_STACK_LEVEL)
warnings.warn("Can't initialize NVML", stacklevel=3)
return None
dev_count = c_int(-1)
rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
if rc != 0:
warnings.warn('Failed to get available device from system.', stacklevel=_STACK_LEVEL)
warnings.warn('Failed to get available device from system.', stacklevel=3)
return None
uuids: list[str] = []
for idx in range(dev_count.value):
dev_id = c_void_p()
rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
if rc != 0:
warnings.warn(f'Failed to get device handle for {idx}', stacklevel=_STACK_LEVEL)
warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3)
return None
buf_len = 96
buf = create_string_buffer(buf_len)
rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
if rc != 0:
warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=_STACK_LEVEL)
warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3)
return None
uuids.append(buf.raw.decode('ascii').strip('\0'))
del nvml_h
return uuids
def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
if cls.resource_id == 'amd.com/gpu':
raise RuntimeError(
"AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
)
if not all(isinstance(i, str) for i in val):
raise ValueError('Input list should be all string type.')
class _ResourceMixin:
@staticmethod
def from_system(cls) -> list[str]:
visible_devices = _parse_visible_devices()
if visible_devices is None:
if cls.resource_id == 'amd.com/gpu':
if not psutil.LINUX:
if DEBUG:
logger.debug('AMD GPUs is currently only supported on Linux.')
return []
# ROCm does not currently have the rocm_smi wheel.
# So we need to use the ctypes bindings directly.
# we don't want to use CLI because parsing is a pain.
sys.path.append('/opt/rocm/libexec/rocm_smi')
try:
from ctypes import byref, c_uint32
try:
from cuda import cuda
# refers to https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/python_smi_tools/rsmiBindings.py
from rsmiBindings import rocmsmi, rsmi_status_t
err, *_ = cuda.cuInit(0)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Failed to initialise CUDA runtime binding.')
# correctly parse handle
for el in val:
if el.startswith(('GPU-', 'MIG-')):
uuids = _raw_device_uuid_nvml()
if uuids is None:
raise ValueError('Failed to parse available GPUs UUID')
if el not in uuids:
raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
elif el.isdigit():
err, _ = cuda.cuDeviceGet(int(el))
if err != cuda.CUresult.CUDA_SUCCESS:
raise ValueError(f'Failed to get device {el}')
except (ImportError, RuntimeError):
pass
device_count = c_uint32(0)
ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
return [str(i) for i in range(device_count.value)]
return []
# In this case the binary is not found, returning empty list
except (ModuleNotFoundError, ImportError):
return []
finally:
sys.path.remove('/opt/rocm/libexec/rocm_smi')
else:
try:
from cuda import cuda
cuda.cuInit(0)
_, dev = cuda.cuDeviceGetCount()
return [str(i) for i in range(dev)]
except (ImportError, RuntimeError, AttributeError):
return []
return visible_devices
@staticmethod
def from_spec(cls, spec) -> list[str]:
if isinstance(spec, int):
if spec in (-1, 0):
return []
if spec < -1:
raise ValueError('Spec cannot be < -1.')
return [str(i) for i in range(spec)]
elif isinstance(spec, str):
if not spec:
return []
if spec.isdigit():
spec = ','.join([str(i) for i in range(_strtoul(spec))])
return _parse_visible_devices(spec, respect_env=False)
elif isinstance(spec, list):
return [str(x) for x in spec]
else:
raise TypeError(
f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
)
@staticmethod
def validate(cls, val: list[t.Any]) -> None:
if cls.resource_id == 'amd.com/gpu':
raise RuntimeError(
"AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
)
if not all(isinstance(i, str) for i in val):
raise ValueError('Input list should be all string type.')
try:
from cuda import cuda
err, *_ = cuda.cuInit(0)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Failed to initialise CUDA runtime binding.')
# correctly parse handle
for el in val:
if el.startswith(('GPU-', 'MIG-')):
uuids = _raw_device_uuid_nvml()
if uuids is None:
raise ValueError('Failed to parse available GPUs UUID')
if el not in uuids:
raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
elif el.isdigit():
err, _ = cuda.cuDeviceGet(int(el))
if err != cuda.CUresult.CUDA_SUCCESS:
raise ValueError(f'Failed to get device {el}')
except (ImportError, RuntimeError):
pass
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]:
return types.new_class(
name,
(bentoml.Resource[t.List[str]], ReprMixin),
@@ -248,9 +214,9 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
lambda ns: ns.update(
{
'resource_id': resource_kind,
'from_spec': classmethod(_from_spec),
'from_system': classmethod(_from_system),
'validate': classmethod(_validate),
'from_spec': classmethod(_ResourceMixin.from_spec),
'from_system': classmethod(_ResourceMixin.from_system),
'validate': classmethod(_ResourceMixin.validate),
'__repr_keys__': property(lambda _: {'resource_id'}),
'__doc__': inspect.cleandoc(docstring),
'__module__': 'openllm._strategies',
@@ -259,15 +225,9 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
)
# NOTE: we need to hint these t.Literal since mypy is too dumb to infer this as literal 🤦
_TPU_RESOURCE: t.Literal['cloud-tpus.google.com/v2'] = 'cloud-tpus.google.com/v2'
_AMD_GPU_RESOURCE: t.Literal['amd.com/gpu'] = 'amd.com/gpu'
_NVIDIA_GPU_RESOURCE: t.Literal['nvidia.com/gpu'] = 'nvidia.com/gpu'
_CPU_RESOURCE: t.Literal['cpu'] = 'cpu'
NvidiaGpuResource = _make_resource_class(
'NvidiaGpuResource',
_NVIDIA_GPU_RESOURCE,
'nvidia.com/gpu',
"""NVIDIA GPU resource.
This is a modified version of internal's BentoML's NvidiaGpuResource
@@ -275,7 +235,7 @@ NvidiaGpuResource = _make_resource_class(
)
AmdGpuResource = _make_resource_class(
'AmdGpuResource',
_AMD_GPU_RESOURCE,
'amd.com/gpu',
"""AMD GPU resource.
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
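
The from_spec logic folded into _ResourceMixin accepts an int count, a string, or an explicit list of device ids. A condensed, simplified sketch of that normalisation (not the exact implementation, and without the CUDA_VISIBLE_DEVICES handling):

def normalise_gpu_spec(spec) -> list:
    if isinstance(spec, int):
        if spec in (-1, 0):
            return []
        if spec < -1:
            raise ValueError('Spec cannot be < -1.')
        return [str(i) for i in range(spec)]
    if isinstance(spec, str):
        if not spec:
            return []
        if spec.isdigit():  # '3' means the first three devices
            return [str(i) for i in range(int(spec))]
        return [s.strip() for s in spec.split(',') if s.strip()]
    if isinstance(spec, list):
        return [str(x) for x in spec]
    raise TypeError(f'Unsupported spec type: {type(spec)}')

assert normalise_gpu_spec(2) == ['0', '1']
assert normalise_gpu_spec('GPU-5ebe9f43,1') == ['GPU-5ebe9f43', '1']  # placeholder UUID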

View File

@@ -1,9 +1,6 @@
# fmt: off
import openllm_client as _client
def __dir__():
return sorted(dir(_client))
def __getattr__(it):
return getattr(_client, it)
def __dir__():return sorted(dir(_client))
def __getattr__(it):return getattr(_client, it)
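
The two one-liners above are the PEP 562 module-level __getattr__/__dir__ hooks that proxy this module to openllm_client. A slightly fuller illustrative variant caches each resolved name on the proxy module so repeated lookups skip the indirection:

import sys
import openllm_client as _client

def __dir__():
    return sorted(dir(_client))

def __getattr__(name):
    try:
        value = getattr(_client, name)
    except AttributeError:
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}') from None
    setattr(sys.modules[__name__], name, value)  # cache on this module
    return value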

View File

@@ -1,75 +0,0 @@
from __future__ import annotations
import contextlib
import logging
import shutil
import subprocess
import typing as t
import bentoml
import openllm
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralBackend, LiteralQuantise
logger = logging.getLogger(__name__)
@contextlib.contextmanager
def build_bento(
model: str, model_id: str | None = None, quantize: LiteralQuantise | None = None, cleanup: bool = False
) -> t.Iterator[bentoml.Bento]:
logger.info('Building BentoML for %s', model)
bento = openllm.build(model, model_id=model_id, quantize=quantize)
yield bento
if cleanup:
logger.info('Deleting %s', bento.tag)
bentoml.bentos.delete(bento.tag)
@contextlib.contextmanager
def build_container(
bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any
) -> t.Iterator[str]:
if isinstance(bento, bentoml.Bento):
bento_tag = bento.tag
else:
bento_tag = bentoml.Tag.from_taglike(bento)
if image_tag is None:
image_tag = str(bento_tag)
executable = shutil.which('docker')
if not executable:
raise RuntimeError('docker executable not found')
try:
logger.info('Building container for %s', bento_tag)
bentoml.container.build(bento_tag, backend='docker', image_tag=(image_tag,), progress='plain', **attrs)
yield image_tag
finally:
if cleanup:
logger.info('Deleting container %s', image_tag)
subprocess.check_output([executable, 'rmi', '-f', image_tag])
@contextlib.contextmanager
def prepare(
model: str,
model_id: str,
backend: LiteralBackend = 'pt',
deployment_mode: t.Literal['container', 'local'] = 'local',
clean_context: contextlib.ExitStack | None = None,
cleanup: bool = True,
) -> t.Iterator[str]:
if clean_context is None:
clean_context = contextlib.ExitStack()
cleanup = True
llm = openllm.LLM[t.Any, t.Any](model_id, backend=backend)
bento_tag = bentoml.Tag.from_taglike(f'{llm.llm_type}-service:{llm.tag.version}')
if not bentoml.list(bento_tag):
bento = clean_context.enter_context(build_bento(model, model_id=model_id, cleanup=cleanup))
else:
bento = bentoml.get(bento_tag)
container_name = f'openllm-{model}-{llm.llm_type}'.replace('-', '_')
if deployment_mode == 'container':
container_name = clean_context.enter_context(build_container(bento, image_tag=container_name, cleanup=cleanup))
yield container_name
if cleanup:
clean_context.close()
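
For reference, the deleted helpers lean on contextlib.ExitStack so each build step registers its own teardown and the stack unwinds them in reverse order when closed. A generic sketch of that pattern (the build/delete callables are placeholders):

import contextlib

@contextlib.contextmanager
def managed(build, delete, cleanup=True):
    artifact = build()
    try:
        yield artifact
    finally:
        if cleanup:
            delete(artifact)

def prepare(steps, clean_context=None, cleanup=True):
    # like the deleted prepare(): the caller may hand in its own ExitStack and close it later
    if clean_context is None:
        clean_context = contextlib.ExitStack()
    artifacts = [clean_context.enter_context(managed(build, delete, cleanup)) for build, delete in steps]
    return clean_context, artifacts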

View File

@@ -44,7 +44,6 @@ from openllm_core.utils import (
is_transformers_available as is_transformers_available,
is_vllm_available as is_vllm_available,
lenient_issubclass as lenient_issubclass,
reserve_free_port as reserve_free_port,
resolve_filepath as resolve_filepath,
resolve_user_filepath as resolve_user_filepath,
serde as serde,