diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py
index 2f59d63f..02ee1001 100644
--- a/openllm-core/src/openllm_core/utils/__init__.py
+++ b/openllm-core/src/openllm_core/utils/__init__.py
@@ -138,11 +138,6 @@ def ensure_exec_coro(coro: t.Coroutine[t.Any, t.Any, t.Any]) -> t.Any:
   if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result()
   else: return loop.run_until_complete(coro)
 
-def available_devices() -> tuple[str, ...]:
-  """Return available GPU under system. Currently only supports NVIDIA GPUs."""
-  from openllm_core._strategies import NvidiaGpuResource
-  return tuple(NvidiaGpuResource.from_system())
-
 @functools.lru_cache(maxsize=128)
 def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1') -> str:
   """Generate a hash from given file's modification time.
@@ -156,10 +151,6 @@ def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1'
   """
   return getattr(hashlib, algorithm)(str(os.path.getmtime(resolve_filepath(f))).encode()).hexdigest()
 
-@functools.lru_cache(maxsize=1)
-def device_count() -> int:
-  return len(available_devices())
-
 def check_bool_env(env: str, default: bool = True) -> bool:
   v = os.environ.get(env, str(default)).upper()
   if v.isdigit(): return bool(int(v))  # special check for digits
diff --git a/openllm-core/src/openllm_core/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py
index fb6be61b..66c32c98 100644
--- a/openllm-core/src/openllm_core/utils/dantic.py
+++ b/openllm-core/src/openllm_core/utils/dantic.py
@@ -399,7 +399,7 @@ class CudaValueType(ParamType):
       param: The parameter that is requesting completion.
       incomplete: Value being completed. May be empty.
     """
-    from openllm_core.utils import available_devices
+    from openllm.utils import available_devices
     mapping = incomplete.split(self.envvar_list_splitter) if incomplete else available_devices()
     return [sc.CompletionItem(str(i), help=f'CUDA device index {i}') for i in mapping]
 
diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py
index c12e6807..be0aa405 100644
--- a/openllm-python/src/openllm/_runners.py
+++ b/openllm-python/src/openllm/_runners.py
@@ -15,7 +15,6 @@ from openllm_core._schemas import GenerationOutput
 from openllm_core._typing_compat import LiteralBackend
 from openllm_core._typing_compat import M
 from openllm_core._typing_compat import T
-from openllm_core.utils import device_count
 from openllm_core.utils import first_not_none
 from openllm_core.utils import get_debug_mode
 from openllm_core.utils import is_vllm_available
@@ -41,7 +40,7 @@ class vLLMRunnable(bentoml.Runnable):
 
   def __init__(self, llm: openllm.LLM[M, T]) -> None:
     self.config = llm.config
-    num_gpus, dev = 1, device_count()
+    num_gpus, dev = 1, openllm.utils.device_count()
     if dev >= 2: num_gpus = min(dev // 2 * 2, dev)
     quantization = None
     if llm._quantise and llm._quantise == 'awq': quantization = llm._quantise
@@ -49,6 +48,7 @@ class vLLMRunnable(bentoml.Runnable):
       self.model = vllm.AsyncLLMEngine.from_engine_args(
         vllm.AsyncEngineArgs(model=llm.bentomodel.path,
                              tokenizer=llm.bentomodel.path,
+                             trust_remote_code=llm.trust_remote_code,
                              tokenizer_mode='auto',
                              tensor_parallel_size=num_gpus,
                              dtype='auto',
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index 0bfe1e75..5b6ba480 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -202,7 +202,7 @@ def create_bento(bento_tag: bentoml.Tag,
   if isinstance(workers_per_resource, str):
     if workers_per_resource == 'round_robin': workers_per_resource = 1.0
     elif workers_per_resource == 'conserved':
-      workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count())
+      workers_per_resource = 1.0 if openllm.utils.device_count() == 0 else float(1 / openllm.utils.device_count())
     else:
       try:
         workers_per_resource = float(workers_per_resource)
diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py
index b144652e..e10e051a 100644
--- a/openllm-python/src/openllm/bundle/oci/__init__.py
+++ b/openllm-python/src/openllm/bundle/oci/__init__.py
@@ -137,8 +137,6 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
     if not _BUILDER.health(): raise openllm.exceptions.Error
   except (openllm.exceptions.Error, subprocess.CalledProcessError):
     raise RuntimeError('Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.') from None
-  if openllm_core.utils.device_count() == 0:
-    raise RuntimeError('Building base container requires GPUs (None available)')
   if not shutil.which('nvidia-container-runtime'):
     raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
   if not _module_location:
@@ -147,9 +145,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
   if not pyproject_path.exists():
     raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
   if not registries:
-    tags: dict[str | LiteralContainerRegistry, str] = {
-      alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()
-    }  # default to all registries with latest tag strategy
+    tags: dict[str | LiteralContainerRegistry, str] = {alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()}
   else:
     registries = [registries] if isinstance(registries, str) else list(registries)
     tags = {name: f'{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}' for name in registries}
diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py
index 59169725..89f28a18 100644
--- a/openllm-python/src/openllm/utils/__init__.py
+++ b/openllm-python/src/openllm/utils/__init__.py
@@ -6,6 +6,7 @@ we won't ensure backward compatibility for these functions. So use with caution.
 from __future__ import annotations
 
 import typing as t
+import functools
 import openllm_core
 
 if t.TYPE_CHECKING:
@@ -14,7 +15,16 @@ if t.TYPE_CHECKING:
 def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
   return {'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation}
 
-__all__ = ['generate_labels']
+def available_devices() -> tuple[str, ...]:
+  """Return available GPU under system. Currently only supports NVIDIA GPUs."""
+  from .._strategies import NvidiaGpuResource
+  return tuple(NvidiaGpuResource.from_system())
+
+@functools.lru_cache(maxsize=1)
+def device_count() -> int:
+  return len(available_devices())
+
+__all__ = ['generate_labels', 'available_devices', 'device_count']
 
 def __dir__() -> t.Sequence[str]:
   return sorted(__all__)
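
Usage note, not part of the patch: a minimal sketch of the relocated helpers at their new home, assuming openllm is installed and importable. It mirrors the tensor-parallel sizing in vLLMRunnable.__init__ above; on a host without NVIDIA GPUs, available_devices() returns an empty tuple and device_count() returns 0.

# Sketch only: exercises available_devices()/device_count() from openllm.utils.
import openllm

devices = openllm.utils.available_devices()  # e.g. ('0', '1') on a two-GPU host
dev = openllm.utils.device_count()           # cached via functools.lru_cache

# Same sizing rule as vLLMRunnable.__init__: with two or more GPUs,
# use the largest even number of devices for tensor parallelism.
num_gpus = 1
if dev >= 2:
  num_gpus = min(dev // 2 * 2, dev)
print(f'devices={devices}, tensor_parallel_size={num_gpus}')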