From 96318b65ee060f76c81c60c13d0ebe83adf92ffd Mon Sep 17 00:00:00 2001 From: Aaron <29749331+aarnphm@users.noreply.github.com> Date: Sun, 26 Nov 2023 04:53:36 -0500 Subject: [PATCH] fix(sdk): remove broken sdk codespace now around 2.8k lines Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- .../src/openllm_core/utils/__init__.py | 4 +- openllm-python/src/openllm/__init__.py | 18 +- openllm-python/src/openllm/_deprecated.py | 83 ++------ openllm-python/src/openllm/_llm.py | 69 ++----- openllm-python/src/openllm/_quantisation.py | 19 +- openllm-python/src/openllm/_service.py | 55 ++---- openllm-python/src/openllm/_service_vars.py | 9 +- openllm-python/src/openllm/_strategies.py | 181 +++++------------- openllm-python/src/openllm/bundle/__init__.py | 7 +- openllm-python/src/openllm/bundle/_package.py | 38 ++-- openllm-python/src/openllm/client.py | 12 +- .../src/openllm/entrypoints/__init__.py | 13 +- .../src/openllm/entrypoints/cohere.py | 29 +-- openllm-python/src/openllm/entrypoints/hf.py | 22 +-- openllm-python/src/openllm/exceptions.py | 9 +- .../src/openllm/protocol/__init__.py | 9 +- openllm-python/src/openllm/utils.py | 32 +--- openllm-python/src/openllm_cli/_sdk.py | 127 ++++-------- 18 files changed, 179 insertions(+), 557 deletions(-) diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index f1db9f89..574c94ee 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -234,6 +234,4 @@ __lazy = LazyModule( }, extra_objects=_extras, ) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 9120074b..8085e06e 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -1,7 +1,6 @@ import logging as _logging, os as _os, pathlib as _pathlib, warnings as _warnings from openllm_cli import _sdk from . 
import utils as utils - if utils.DEBUG: utils.set_debug_mode(True); _logging.basicConfig(level=_logging.NOTSET) else: @@ -12,11 +11,8 @@ else: _warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization') _warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.') _warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated') - COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so') - -# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__ -__lazy = utils.LazyModule( +__lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__ __name__, globals()['__file__'], { @@ -34,14 +30,8 @@ __lazy = utils.LazyModule( '_llm': ['LLM'], }, extra_objects={ - 'COMPILED': COMPILED, - 'start': _sdk.start, - 'start_grpc': _sdk.start_grpc, - 'build': _sdk.build, - 'import_model': _sdk.import_model, - 'list_models': _sdk.list_models, + 'COMPILED': COMPILED, 'start': _sdk.start, 'build': _sdk.build, # + 'import_model': _sdk.import_model, 'list_models': _sdk.list_models, # }, ) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/_deprecated.py b/openllm-python/src/openllm/_deprecated.py index c4a39a4e..a1ffbbdb 100644 --- a/openllm-python/src/openllm/_deprecated.py +++ b/openllm-python/src/openllm/_deprecated.py @@ -1,65 +1,21 @@ from __future__ import annotations -import logging -import os -import typing as t -import warnings - +import logging, os, warnings, typing as t import openllm -from openllm_core._typing_compat import LiteralBackend, ParamSpec +from openllm_core._typing_compat import LiteralBackend from openllm_core.utils import first_not_none, getenv, is_vllm_available -if t.TYPE_CHECKING: - from ._runners import Runner as _Runner - -P = ParamSpec('P') - +__all__ = ['Runner'] logger = logging.getLogger(__name__) - def Runner( - model_name: str, - ensure_available: bool = True, - init_local: bool = False, - backend: LiteralBackend | None = None, - llm_config: openllm.LLMConfig | None = None, - **attrs: t.Any, -) -> _Runner[t.Any, t.Any]: - """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'. - - > [!WARNING] - > This method is now deprecated and in favor of 'openllm.LLM' - - ```python - runner = openllm.Runner("dolly-v2") - - @svc.on_startup - def download(): - runner.download_model() - ``` - - if `init_local=True` (For development workflow), it will also enable `ensure_available`. - Default value of `ensure_available` is None. If set then use that given value, otherwise fallback to the aforementioned behaviour. - - Args: - model_name: Supported model name from 'openllm models' - ensure_available: If True, it will download the model if it is not available. If False, it will skip downloading the model. - If False, make sure the model is available locally. Default to True, and openllm.LLM will always check if models - are available locally. based on generated tag. - backend: The given Runner implementation one choose for this Runner. If `OPENLLM_BACKEND` is set, it will respect it. - llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``. - init_local: If True, it will initialize the model locally. 
This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local()) - **attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour - """ - from ._llm import LLM - - if llm_config is None: - llm_config = openllm.AutoConfig.for_model(model_name) - if not ensure_available: - logger.warning( - "'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation." - ) + model_name: str, ensure_available: bool = True, # + init_local: bool = False, backend: LiteralBackend | None = None, # + llm_config: openllm.LLMConfig | None = None, **attrs: t.Any, +): + if llm_config is None: llm_config = openllm.AutoConfig.for_model(model_name) + if not ensure_available: logger.warning("'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation.") model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id'])) - _RUNNER_MSG = f'''\ + warnings.warn(f'''\ Using 'openllm.Runner' is now deprecated. Make sure to switch to the following syntax: ```python @@ -70,22 +26,11 @@ def Runner( @svc.api(...) async def chat(input: str) -> str: async for it in llm.generate_iterator(input): print(it) - ``` - ''' - warnings.warn(_RUNNER_MSG, DeprecationWarning, stacklevel=2) + ```''', DeprecationWarning, stacklevel=2) attrs.update( { - 'model_id': model_id, - 'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), - 'serialisation': getenv( - 'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION'] - ), + 'model_id': model_id, 'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), # + 'serialisation': getenv('serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']), } ) - - backend = t.cast(LiteralBackend, first_not_none(backend, default='vllm' if is_vllm_available() else 'pt')) - llm = LLM[t.Any, t.Any](backend=backend, llm_config=llm_config, embedded=init_local, **attrs) - return llm.runner - - -__all__ = ['Runner'] + return openllm.LLM(backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), llm_config=llm_config, embedded=init_local, **attrs).runner diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 9b622697..2dbbc563 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -47,23 +47,17 @@ ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]] @attr.define(slots=False, repr=False, init=False) class LLM(t.Generic[M, T]): - async def generate( - self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs - ) -> GenerationOutput: - if adapter_name is not None and self.__llm_backend__ != 'pt': - raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') + async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs): + if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') config = self.config.model_construct_env(**attrs) texts, token_ids = [[]] * config['n'], [[]] * config['n'] - final_result = None async for result in self.generate_iterator( prompt, prompt_token_ids, stop, stop_token_ids, request_id, 
adapter_name, **config.model_dump(flatten=True) ): for output in result.outputs: texts[output.index].append(output.text) token_ids[output.index].extend(output.token_ids) - final_result = result - if final_result is None: - raise RuntimeError('No result is returned.') + if (final_result := result) is None: raise RuntimeError('No result is returned.') return final_result.with_options( prompt=prompt, outputs=[ @@ -72,13 +66,9 @@ class LLM(t.Generic[M, T]): ], ) - async def generate_iterator( - self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs - ) -> t.AsyncGenerator[GenerationOutput, None]: + async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs): from bentoml._internal.runner.runner_handle import DummyRunnerHandle - - if adapter_name is not None and self.__llm_backend__ != 'pt': - raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') + if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') if isinstance(self.runner._runner_handle, DummyRunnerHandle): if os.getenv('BENTO_PATH') is not None: @@ -87,14 +77,12 @@ class LLM(t.Generic[M, T]): self.runner.init_local(quiet=True) config = self.config.model_construct_env(**attrs) - if stop_token_ids is None: stop_token_ids = [] + stop_token_ids = stop_token_ids or [] eos_token_id = attrs.get('eos_token_id', config['eos_token_id']) - if eos_token_id is not None: - if not isinstance(eos_token_id, list): eos_token_id = [eos_token_id] - stop_token_ids.extend(eos_token_id) - if config['eos_token_id'] and config['eos_token_id'] not in stop_token_ids: stop_token_ids.append(config['eos_token_id']) - if self.tokenizer.eos_token_id not in stop_token_ids: - stop_token_ids.append(self.tokenizer.eos_token_id) + if eos_token_id and not isinstance(eos_token_id, list): eos_token_id = [eos_token_id] + stop_token_ids.extend(eos_token_id or []) + if (config_eos := config['eos_token_id']) and config_eos not in stop_token_ids: stop_token_ids.append(config_eos) + if self.tokenizer.eos_token_id not in stop_token_ids: stop_token_ids.append(self.tokenizer.eos_token_id) if stop is None: stop = set() elif isinstance(stop, str): @@ -102,20 +90,16 @@ class LLM(t.Generic[M, T]): else: stop = set(stop) for tid in stop_token_ids: - if tid: - stop.add(self.tokenizer.decode(tid)) + if tid: stop.add(self.tokenizer.decode(tid)) if prompt_token_ids is None: - if prompt is None: - raise ValueError('Either prompt or prompt_token_ids must be specified.') + if prompt is None: raise ValueError('Either prompt or prompt_token_ids must be specified.') prompt_token_ids = self.tokenizer.encode(prompt) request_id = gen_random_uuid() if request_id is None else request_id previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n'] try: - generator = self.runner.generate_iterator.async_stream( - prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True) - ) + generator = self.runner.generate_iterator.async_stream(prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True)) except Exception as err: raise RuntimeError(f'Failed to start generation task: {err}') from err @@ -134,18 +118,11 @@ class LLM(t.Generic[M, T]): # NOTE: If you are here to see how generate_iterator and generate works, see above. 
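For readers tracing the two entry points above, a minimal consumption sketch (illustrative only, not part of this patch): the model id is a placeholder, and it assumes `openllm.LLM` accepts a model id as its first positional argument, as the deprecation notice earlier in this patch suggests.

```python
import asyncio
import openllm

async def main():
    llm = openllm.LLM('facebook/opt-125m')  # hypothetical model id, for illustration only

    # generate() drains generate_iterator() internally and returns the final GenerationOutput.
    result = await llm.generate('What is the meaning of life?', max_new_tokens=64)
    print(result.outputs[0].text)

    # generate_iterator() yields GenerationOutput chunks; each chunk carries the newly
    # generated text for its output index.
    async for chunk in llm.generate_iterator('Write a haiku about GPUs.', max_new_tokens=64):
        print(chunk.outputs[0].text, end='', flush=True)

asyncio.run(main())
```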
# The below are mainly for internal implementation that you don't have to worry about. - _model_id: str - _revision: t.Optional[str] + _model_id: str; _revision: t.Optional[str] # _quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] - _quantise: t.Optional[LiteralQuantise] - _model_decls: t.Tuple[t.Any, ...] - __model_attrs: t.Dict[str, t.Any] - __tokenizer_attrs: t.Dict[str, t.Any] - _tag: bentoml.Tag - _adapter_map: t.Optional[AdapterMap] - _serialisation: LiteralSerialisation - _local: bool - _max_model_len: t.Optional[int] + _quantise: t.Optional[LiteralQuantise]; _model_decls: t.Tuple[t.Any, ...]; __model_attrs: t.Dict[str, t.Any] # + __tokenizer_attrs: t.Dict[str, t.Any]; _tag: bentoml.Tag; _adapter_map: t.Optional[AdapterMap] # + _serialisation: LiteralSerialisation; _local: bool; _max_model_len: t.Optional[int] # __llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto' __llm_torch_dtype__: 'torch.dtype' = None @@ -180,12 +157,7 @@ class LLM(t.Generic[M, T]): ): torch_dtype = attrs.pop('torch_dtype', None) # backward compatible if torch_dtype is not None: - warnings.warn( - 'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', - DeprecationWarning, - stacklevel=3, - ) - dtype = torch_dtype + warnings.warn('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3); dtype = torch_dtype _local = False if validate_is_path(model_id): model_id, _local = resolve_filepath(model_id), True backend = getenv('backend', default=backend) @@ -291,7 +263,7 @@ class LLM(t.Generic[M, T]): if is_vllm_available(): return 'vllm' elif is_ctranslate_available(): - return 'ctranslate' # XXX: base OpenLLM image should always include vLLM + return 'ctranslate' elif is_ctranslate_available(): return 'ctranslate' else: @@ -449,8 +421,7 @@ def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME) except Exception as err: raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err - with open(config_file, 'r') as file: - resolved_config = orjson.loads(file.read()) + with open(config_file, 'r') as file: resolved_config = orjson.loads(file.read()) _peft_type = resolved_config['peft_type'].lower() if _peft_type not in resolved: resolved[_peft_type] = () resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),) diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index 9224f17e..5e430662 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -1,13 +1,8 @@ from __future__ import annotations - from openllm_core.exceptions import MissingDependencyError from openllm_core.utils import is_autoawq_available, is_autogptq_available, is_bitsandbytes_available - - def infer_quantisation_config(llm, quantise, **attrs): - import torch - import transformers - + import torch, transformers # 8 bit configuration int8_threshold = attrs.pop('llm_int8_threshhold', 6.0) int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False) @@ -85,25 +80,19 @@ def infer_quantisation_config(llm, quantise, **attrs): # NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training. 
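Before the dispatch that follows, a hedged sketch of the kind of objects the 'int8' and 'int4' branches produce via transformers' `BitsAndBytesConfig`. The `create_int8_config`/`create_int4_config` bodies are not shown in this hunk, so the 4-bit defaults and the skip-module choice below are assumptions.

```python
import torch
import transformers

# Roughly what the 'int8' branch builds; threshold/offload values mirror the attrs popped above.
int8_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_skip_modules=['lm_head'],  # assumption: a typical module kept in higher precision
)

# Roughly what the 'int4' branch builds; nf4 + double quantisation are assumed defaults.
int4_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
)
```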
if not is_bitsandbytes_available(): - raise RuntimeError( - 'Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'' - ) + raise RuntimeError('Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'') if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules) elif quantise == 'int4': quantisation_config = create_int4_config() elif quantise == 'gptq': if not is_autogptq_available(): - raise MissingDependencyError( - "GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'" - ) + raise MissingDependencyError("GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'") else: quantisation_config = create_gptq_config() elif quantise == 'awq': if not is_autoawq_available(): - raise MissingDependencyError( - "AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'." - ) + raise MissingDependencyError("AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'.") else: quantisation_config = create_awq_config() else: diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 58efe43a..1ecea673 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -1,66 +1,43 @@ from __future__ import annotations -import logging -import typing as t - +import logging, typing as t import _service_vars as svars - -import bentoml -import openllm +import bentoml, openllm from openllm_core._schemas import MessageParam from bentoml.io import JSON, Text logger = logging.getLogger(__name__) - llm = openllm.LLM[t.Any, t.Any]( - model_id=svars.model_id, - model_tag=svars.model_tag, - serialisation=svars.serialization, - adapter_map=svars.adapter_map, - trust_remote_code=svars.trust_remote_code, + model_id=svars.model_id, model_tag=svars.model_tag, adapter_map=svars.adapter_map, # + serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code, ) svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner]) - llm_model_class = openllm.GenerationInput.from_llm_config(llm.config) - @svc.api( route='/v1/generate', - input=JSON.from_sample(llm_model_class.examples()), - output=JSON.from_sample(openllm.GenerationOutput.examples()), + input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()), # ) -async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: - return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump() - +async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump() @svc.api( route='/v1/generate_stream', - input=JSON.from_sample(llm_model_class.examples()), - output=Text(content_type='text/event-stream'), + input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'), # ) async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]: async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()): yield f'data: {it.model_dump_json()}\n\n' yield 'data: [DONE]\n\n' - _Metadata = openllm.MetadataOutput( - timeout=llm.config['timeout'], - model_name=llm.config['model_name'], - backend=llm.__llm_backend__, - 
model_id=llm.model_id, + timeout=llm.config['timeout'], model_name=llm.config['model_name'], # + backend=llm.__llm_backend__, model_id=llm.model_id, # configuration=llm.config.model_dump_json().decode(), ) - @svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump())) -def metadata_v1(_: str) -> openllm.MetadataOutput: - return _Metadata - - -class MessagesConverterInput(t.TypedDict): - add_generation_prompt: bool - messages: t.List[t.Dict[str, t.Any]] +def metadata_v1(_: str) -> openllm.MetadataOutput: return _Metadata +class MessagesConverterInput(t.TypedDict): add_generation_prompt: bool; messages: t.List[t.Dict[str, t.Any]] @svc.api( route='/v1/helpers/messages', @@ -69,18 +46,14 @@ class MessagesConverterInput(t.TypedDict): add_generation_prompt=False, messages=[ MessageParam(role='system', content='You are acting as Ernest Hemmingway.'), - MessageParam(role='user', content='Hi there!'), - MessageParam(role='assistant', content='Yes?'), + MessageParam(role='user', content='Hi there!'), MessageParam(role='assistant', content='Yes?'), # ], ) ), output=Text(), ) def helpers_messages_v1(message: MessagesConverterInput) -> str: - add_generation_prompt = message['add_generation_prompt'] - messages = message['messages'] + add_generation_prompt, messages = message['add_generation_prompt'], message['messages'] return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False) - -# HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema. -openllm.mount_entrypoints(svc, llm) +openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema. diff --git a/openllm-python/src/openllm/_service_vars.py b/openllm-python/src/openllm/_service_vars.py index d8ac5594..9d6f5da4 100644 --- a/openllm-python/src/openllm/_service_vars.py +++ b/openllm-python/src/openllm/_service_vars.py @@ -1,9 +1,2 @@ import os, orjson, openllm_core.utils as coreutils - -model_id, model_tag, adapter_map, serialization, trust_remote_code = ( - os.environ['OPENLLM_MODEL_ID'], - None, - orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), - os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), - coreutils.check_bool_env('TRUST_REMOTE_CODE', False), -) +model_id, model_tag, adapter_map, serialization, trust_remote_code = os.environ['OPENLLM_MODEL_ID'], None, orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), coreutils.check_bool_env('TRUST_REMOTE_CODE', False) diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py index a37e4b7f..ec1c297a 100644 --- a/openllm-python/src/openllm/_strategies.py +++ b/openllm-python/src/openllm/_strategies.py @@ -4,44 +4,33 @@ import psutil, bentoml, openllm_core.utils as coreutils from bentoml._internal.resource import get_resource, system_resources from bentoml._internal.runner.strategy import THREAD_ENVS +__all__ = ['CascadingResourceStrategy', 'get_resource'] logger = logging.getLogger(__name__) - def _strtoul(s: str) -> int: # Return -1 or positive integer sequence string starts with. 
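The helper being defined here, together with `_parse_cuda_visible_devices` below, reimplements PyTorch-style `CUDA_VISIBLE_DEVICES` parsing. A few illustrative expectations (not part of this patch; they assume the private helpers stay importable from `openllm._strategies`):

```python
from openllm._strategies import _parse_cuda_visible_devices, _strtoul

assert _strtoul('0') == 0         # plain ordinal
assert _strtoul('+2') == 2        # a sign is only accepted on the first character
assert _strtoul('GPU-abc') == -1  # non-numeric element
assert _strtoul('') == -1         # empty element

assert _parse_cuda_visible_devices('0,1,2', respect_env=False) == ['0', '1', '2']
assert _parse_cuda_visible_devices('0,1,-1,2', respect_env=False) == ['0', '1']  # a negative value aborts the sequence
assert _parse_cuda_visible_devices('0,0,1', respect_env=False) == []             # a repeated ordinal yields an empty set
```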
- if not s: - return -1 + if not s: return -1 idx = 0 for idx, c in enumerate(s): - if not (c.isdigit() or (idx == 0 and c in '+-')): - break - if idx + 1 == len(s): - idx += 1 # noqa: PLW2901 + if not (c.isdigit() or (idx == 0 and c in '+-')): break + if idx + 1 == len(s): idx += 1 # noqa: PLW2901 # NOTE: idx will be set via enumerate return int(s[:idx]) if idx > 0 else -1 - - def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]: - rcs: list[str] = [] + rcs = [] for elem in lst.split(','): # Repeated id results in empty set - if elem in rcs: - return [] + if elem in rcs: return [] # Anything other but prefix is ignored - if not elem.startswith(prefix): - break + if not elem.startswith(prefix): break rcs.append(elem) return rcs - - def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None: if respect_env: spec = os.environ.get('CUDA_VISIBLE_DEVICES', default_var) - if not spec: - return None + if not spec: return None else: - if default_var is None: - raise ValueError('spec is required to be not None when parsing spec.') + if default_var is None: raise ValueError('spec is required to be not None when parsing spec.') spec = default_var if spec.startswith('GPU-'): @@ -55,64 +44,52 @@ def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: boo for el in spec.split(','): x = _strtoul(el.strip()) # Repeated ordinal results in empty set - if x in rc: - return [] + if x in rc: return [] # Negative value aborts the sequence - if x < 0: - break + if x < 0: break rc.append(x) return [str(i) for i in rc] - - def _raw_device_uuid_nvml() -> list[str] | None: from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer try: nvml_h = CDLL('libnvidia-ml.so.1') except Exception: - warnings.warn('Failed to find nvidia binding', stacklevel=3) - return None + warnings.warn('Failed to find nvidia binding', stacklevel=3); return None rc = nvml_h.nvmlInit() if rc != 0: - warnings.warn("Can't initialize NVML", stacklevel=3) - return None + warnings.warn("Can't initialize NVML", stacklevel=3); return None dev_count = c_int(-1) rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) if rc != 0: - warnings.warn('Failed to get available device from system.', stacklevel=3) - return None - uuids: list[str] = [] + warnings.warn('Failed to get available device from system.', stacklevel=3); return None + uuids = [] for idx in range(dev_count.value): dev_id = c_void_p() rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) if rc != 0: - warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3) - return None + warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3); return None buf_len = 96 buf = create_string_buffer(buf_len) rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) if rc != 0: - warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3) - return None + warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3); return None uuids.append(buf.raw.decode('ascii').strip('\0')) del nvml_h return uuids - class _ResourceMixin: @staticmethod def from_system(cls) -> list[str]: visible_devices = _parse_cuda_visible_devices() if visible_devices is None: if cls.resource_id == 'amd.com/gpu': - if not psutil.LINUX: - if coreutils.DEBUG: - logger.debug('AMD GPUs is currently only supported on Linux.') - return [] + if not psutil.LINUX: return [] # ROCm does not currently have the rocm_smi wheel. # So we need to use the ctypes bindings directly. 
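Further below, `from_spec` maps user-provided resource specs onto device lists for these GPU resource classes. A few illustrative expectations (not part of this patch; they assume `NvidiaGpuResource` keeps this behaviour and remains importable from `openllm._strategies`):

```python
from openllm._strategies import NvidiaGpuResource

NvidiaGpuResource.from_spec(2)       # -> ['0', '1']       first two visible devices
NvidiaGpuResource.from_spec(-1)      # -> []               -1 and 0 disable GPU allocation
NvidiaGpuResource.from_spec('3')     # -> ['0', '1', '2']  a digit string expands to a range
NvidiaGpuResource.from_spec('0,2')   # -> ['0', '2']       explicit comma-separated ordinals
NvidiaGpuResource.from_spec([0, 2])  # -> ['0', '2']       list elements are stringified as-is
```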
# we don't want to use CLI because parsing is a pain. + # TODO: Use tinygrad/gpuctypes sys.path.append('/opt/rocm/libexec/rocm_smi') try: from ctypes import byref, c_uint32 @@ -122,8 +99,7 @@ class _ResourceMixin: device_count = c_uint32(0) ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count)) - if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: - return [str(i) for i in range(device_count.value)] + if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: return [str(i) for i in range(device_count.value)] return [] # In this case the binary is not found, returning empty list except (ModuleNotFoundError, ImportError): @@ -140,59 +116,43 @@ class _ResourceMixin: except (ImportError, RuntimeError, AttributeError): return [] return visible_devices - @staticmethod def from_spec(cls, spec) -> list[str]: if isinstance(spec, int): - if spec in (-1, 0): - return [] - if spec < -1: - raise ValueError('Spec cannot be < -1.') + if spec in (-1, 0): return [] + if spec < -1: raise ValueError('Spec cannot be < -1.') return [str(i) for i in range(spec)] elif isinstance(spec, str): - if not spec: - return [] - if spec.isdigit(): - spec = ','.join([str(i) for i in range(_strtoul(spec))]) + if not spec: return [] + if spec.isdigit(): spec = ','.join([str(i) for i in range(_strtoul(spec))]) return _parse_cuda_visible_devices(spec, respect_env=False) elif isinstance(spec, list): return [str(x) for x in spec] else: - raise TypeError( - f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead." - ) - + raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.") @staticmethod def validate(cls, val: list[t.Any]) -> None: if cls.resource_id == 'amd.com/gpu': - raise RuntimeError( - "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'" - ) + raise RuntimeError("AMD GPU validation is not yet supported. 
Make sure to call 'get_resource(..., validate=False)'") if not all(isinstance(i, str) for i in val): raise ValueError('Input list should be all string type.') try: from cuda import cuda - err, *_ = cuda.cuInit(0) - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('Failed to initialise CUDA runtime binding.') + if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.') # correctly parse handle for el in val: if el.startswith(('GPU-', 'MIG-')): uuids = _raw_device_uuid_nvml() - if uuids is None: - raise ValueError('Failed to parse available GPUs UUID') - if el not in uuids: - raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})') + if uuids is None: raise ValueError('Failed to parse available GPUs UUID') + if el not in uuids: raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})') elif el.isdigit(): err, _ = cuda.cuDeviceGet(int(el)) - if err != cuda.CUresult.CUDA_SUCCESS: - raise ValueError(f'Failed to get device {el}') + if err != cuda.CUresult.CUDA_SUCCESS: raise ValueError(f'Failed to get device {el}') except (ImportError, RuntimeError): pass - def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]: return types.new_class( name, @@ -201,22 +161,16 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[ lambda ns: ns.update( { 'resource_id': resource_kind, - 'from_spec': classmethod(_ResourceMixin.from_spec), - 'from_system': classmethod(_ResourceMixin.from_system), - 'validate': classmethod(_ResourceMixin.validate), - '__repr_keys__': property(lambda _: {'resource_id'}), - '__doc__': inspect.cleandoc(docstring), - '__module__': 'openllm._strategies', + 'from_spec': classmethod(_ResourceMixin.from_spec), 'from_system': classmethod(_ResourceMixin.from_system), # + 'validate': classmethod(_ResourceMixin.validate), '__repr_keys__': property(lambda _: {'resource_id'}), # + '__doc__': inspect.cleandoc(docstring), '__module__': 'openllm._strategies', # } ), ) - - NvidiaGpuResource = _make_resource_class( 'NvidiaGpuResource', 'nvidia.com/gpu', '''NVIDIA GPU resource. - This is a modified version of internal's BentoML's NvidiaGpuResource where it respects and parse CUDA_VISIBLE_DEVICES correctly.''', ) @@ -224,73 +178,53 @@ AmdGpuResource = _make_resource_class( 'AmdGpuResource', 'amd.com/gpu', '''AMD GPU resource. - Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to ``NvidiaGpuResource``. 
Currently ``validate`` is not yet supported.''', ) - class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin): @classmethod def get_worker_count(cls, runnable_class, resource_request, workers_per_resource): - if resource_request is None: - resource_request = system_resources() + if resource_request is None: resource_request = system_resources() # use NVIDIA kind = 'nvidia.com/gpu' nvidia_req = get_resource(resource_request, kind) - if nvidia_req is not None: - return 1 + if nvidia_req is not None: return 1 # use AMD kind = 'amd.com/gpu' amd_req = get_resource(resource_request, kind, validate=False) - if amd_req is not None: - return 1 + if amd_req is not None: return 1 # use CPU cpus = get_resource(resource_request, 'cpu') if cpus is not None and cpus > 0: - if 'cpu' not in runnable_class.SUPPORTED_RESOURCES: - logger.warning('No known supported resource available for %s, falling back to using CPU.', runnable_class) - if runnable_class.SUPPORTS_CPU_MULTI_THREADING: - if isinstance(workers_per_resource, float) and workers_per_resource < 1.0: - raise ValueError('Fractional CPU multi threading support is not yet supported.') + if isinstance(workers_per_resource, float) and workers_per_resource < 1.0: raise ValueError('Fractional CPU multi threading support is not yet supported.') return int(workers_per_resource) return math.ceil(cpus) * workers_per_resource - # this should not be reached by user since we always read system resource as default - raise ValueError( - f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.' - ) - + raise ValueError(f'No known supported resource available for {runnable_class}. Please check your resource request. 
Leaving it blank will allow BentoML to use system resources.') @classmethod def get_worker_env(cls, runnable_class, resource_request, workers_per_resource, worker_index): cuda_env = os.environ.get('CUDA_VISIBLE_DEVICES', None) disabled = cuda_env in ('', '-1') - environ: dict[str, t.Any] = {} + environ = {} - if resource_request is None: - resource_request = system_resources() + if resource_request is None: resource_request = system_resources() # use NVIDIA kind = 'nvidia.com/gpu' typ = get_resource(resource_request, kind) if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: if disabled: - logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index) - environ['CUDA_VISIBLE_DEVICES'] = cuda_env - return environ + environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index) - logger.debug('Environ for worker %s: %s', worker_index, environ) return environ # use AMD kind = 'amd.com/gpu' typ = get_resource(resource_request, kind, validate=False) if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES: if disabled: - logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index) - environ['CUDA_VISIBLE_DEVICES'] = cuda_env - return environ + environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index) - logger.debug('Environ for worker %s: %s', worker_index, environ) return environ # use CPU cpus = get_resource(resource_request, 'cpu') @@ -298,25 +232,17 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin): environ['CUDA_VISIBLE_DEVICES'] = '-1' # disable gpu if runnable_class.SUPPORTS_CPU_MULTI_THREADING: thread_count = math.ceil(cpus) - for thread_env in THREAD_ENVS: - environ[thread_env] = os.environ.get(thread_env, str(thread_count)) - logger.debug('Environ for worker %s: %s', worker_index, environ) + for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, str(thread_count)) return environ - for thread_env in THREAD_ENVS: - environ[thread_env] = os.environ.get(thread_env, '1') + for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, '1') return environ return environ - @staticmethod def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index): # Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string. if isinstance(workers_per_resource, float): - # NOTE: We hit this branch when workers_per_resource is set to - # float, for example 0.5 or 0.25 - if workers_per_resource > 1: - raise ValueError( - "Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case." - ) + # NOTE: We hit this branch when workers_per_resource is set to float, for example 0.5 or 0.25 + if workers_per_resource > 1: raise ValueError('workers_per_resource > 1 is not supported.') # We are round the assigned resource here. This means if workers_per_resource=.4 # then it will round down to 2. If workers_per_source=0.6, then it will also round up to 2. 
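A worked illustration of the worker-to-GPU mapping implemented here (hypothetical GPU list; not part of this patch):

```python
from openllm._strategies import CascadingResourceStrategy

gpus = ['0', '1', '2', '3']

# Fractional workers_per_resource assigns several GPUs to each worker: 0.5 -> 2 GPUs per worker.
CascadingResourceStrategy.transpile_workers_to_cuda_envvar(0.5, gpus, worker_index=0)  # -> '0,1'
CascadingResourceStrategy.transpile_workers_to_cuda_envvar(0.5, gpus, worker_index=1)  # -> '2,3'

# An integer workers_per_resource shares one GPU among several workers: workers 2 and 3 land on GPU '1'.
CascadingResourceStrategy.transpile_workers_to_cuda_envvar(2, gpus, worker_index=3)    # -> '1'
```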
assigned_resource_per_worker = round(1 / workers_per_resource) @@ -327,21 +253,12 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin): worker_index, assigned_resource_per_worker, ) - raise IndexError( - f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]." - ) - assigned_gpu = gpus[ - assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1) - ] + raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].") + assigned_gpu = gpus[assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)] dev = ','.join(assigned_gpu) else: idx = worker_index // workers_per_resource if idx >= len(gpus): - raise ValueError( - f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}' - ) + raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}') dev = str(gpus[idx]) return dev - - -__all__ = ['CascadingResourceStrategy', 'get_resource'] diff --git a/openllm-python/src/openllm/bundle/__init__.py b/openllm-python/src/openllm/bundle/__init__.py index 6ace1c6a..5dbe6806 100644 --- a/openllm-python/src/openllm/bundle/__init__.py +++ b/openllm-python/src/openllm/bundle/__init__.py @@ -4,7 +4,6 @@ from openllm_core._typing_compat import LiteralVersionStrategy from openllm_core.exceptions import OpenLLMException from openllm_core.utils.lazy import VersionInfo, LazyModule -_OWNER, _REPO = 'bentoml', 'openllm' @attr.attrs(eq=False, order=False, slots=True, frozen=True) class RefResolver: git_hash: str = attr.field() @@ -17,7 +16,7 @@ class RefResolver: if strategy_or_version is None or strategy_or_version == 'release': try: from ghapi.all import GhApi - ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False) + ghapi = GhApi(owner='bentoml', repo='openllm', authenticate=False) meta = ghapi.repos.get_latest_release() git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'] except Exception as err: @@ -35,6 +34,4 @@ __lazy = LazyModule( {'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options']}, extra_objects={'RefResolver': RefResolver} ) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index c5a66b5d..83458f67 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -1,15 +1,7 @@ -# mypy: disable-error-code="misc" from __future__ import annotations -import importlib.metadata -import logging -import os -from pathlib import Path - -import orjson +import importlib.metadata, logging, os, pathlib +import bentoml, orjson, openllm_core from simple_di import Provide, inject - -import bentoml -import openllm_core from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions from bentoml._internal.configuration.containers import BentoMLContainer from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg @@ -17,7 +9,7 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD' -_service_file = 
Path(os.path.abspath(__file__)).parent.parent / '_service.py' +_service_file = pathlib.Path(os.path.abspath(__file__)).parent.parent / '_service.py' _SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}''' def build_editable(path, package='openllm'): @@ -28,7 +20,7 @@ def build_editable(path, package='openllm'): from build.env import IsolatedEnvBuilder module_location = pkg.source_locations(package) if not module_location: raise RuntimeError('Could not find the source location of OpenLLM.') - pyproject_path = Path(module_location).parent.parent / 'pyproject.toml' + pyproject_path = pathlib.Path(module_location).parent.parent / 'pyproject.toml' if os.path.isfile(pyproject_path.__fspath__()): with IsolatedEnvBuilder() as env: builder = ProjectBuilder(pyproject_path.parent) @@ -70,12 +62,9 @@ def create_bento( labels = dict(llm.identifying_params) labels.update( { - '_type': llm.llm_type, '_framework': llm.__llm_backend__, 'start_name': llm.config['start_name'], - 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle', - **{ - f'{package.replace("-","_")}_version': importlib.metadata.version(package) - for package in {'openllm', 'openllm-core', 'openllm-client'} - }, + '_type': llm.llm_type, '_framework': llm.__llm_backend__, + 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle', + **{f'{package.replace("-","_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}}, } ) if adapter_map: labels.update(adapter_map) @@ -83,18 +72,15 @@ def create_bento( logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__) logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/')) script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. 
DO NOT EDIT\n" + _SERVICE_VARS.format( - __model_id__=llm.model_id, - __model_tag__=str(llm.tag), - __model_adapter_map__=orjson.dumps(adapter_map).decode(), - __model_serialization__=llm.config['serialisation'], + __model_id__=llm.model_id, __model_tag__=str(llm.tag), # + __model_adapter_map__=orjson.dumps(adapter_map).decode(), __model_serialization__=llm.config['serialisation'], # __model_trust_remote_code__=str(llm.trust_remote_code), ) if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script) llm_fs.writetext('_service_vars.py', script) with open(_service_file.__fspath__(), 'r') as f: service_src = f.read() llm_fs.writetext(llm.config['service_name'], service_src) - - bento = bentoml.Bento.create( + return bentoml.Bento.create( version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'), build_config=BentoBuildConfig( @@ -108,6 +94,4 @@ def create_bento( python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), docker=construct_docker_options(llm, llm_fs, quantize, adapter_map, dockerfile_template, _serialisation), ), - ) - - return bento.save(bento_store=_bento_store, model_store=_model_store) + ).save(bento_store=_bento_store, model_store=_model_store) diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py index 8c5c8fc1..591aecc1 100644 --- a/openllm-python/src/openllm/client.py +++ b/openllm-python/src/openllm/client.py @@ -1,10 +1,2 @@ -def __dir__(): - import openllm_client as _client - - return sorted(dir(_client)) - - -def __getattr__(it): - import openllm_client as _client - - return getattr(_client, it) +def __dir__(): import openllm_client as _client; return sorted(dir(_client)) +def __getattr__(it): import openllm_client as _client; return getattr(_client, it) diff --git a/openllm-python/src/openllm/entrypoints/__init__.py b/openllm-python/src/openllm/entrypoints/__init__.py index b2b4e85a..fc64d69b 100644 --- a/openllm-python/src/openllm/entrypoints/__init__.py +++ b/openllm-python/src/openllm/entrypoints/__init__.py @@ -1,20 +1,11 @@ import importlib - from openllm_core.utils import LazyModule _import_structure = {'openai': [], 'hf': [], 'cohere': []} - - def mount_entrypoints(svc, llm): for module_name in _import_structure: module = importlib.import_module(f'.{module_name}', __name__) svc = module.mount_to_svc(svc, llm) return svc - - -__lazy = LazyModule( - __name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints} -) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}) +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/entrypoints/cohere.py b/openllm-python/src/openllm/entrypoints/cohere.py index 7197b54d..1192e3df 100644 --- a/openllm-python/src/openllm/entrypoints/cohere.py +++ b/openllm-python/src/openllm/entrypoints/cohere.py @@ -1,17 +1,11 @@ from __future__ import annotations -import functools -import json -import logging -import traceback +import functools, json, logging, traceback from http import HTTPStatus - import orjson from starlette.applications import Starlette from starlette.responses import JSONResponse, StreamingResponse from starlette.routing import Route - from openllm_core.utils import DEBUG, converter, gen_random_uuid - from ._openapi import add_schema_definitions, append_schemas, get_generator from 
..protocol.cohere import ( Chat, @@ -54,41 +48,31 @@ schemas = get_generator( logger = logging.getLogger(__name__) -def jsonify_attr(obj): - return json.dumps(converter.unstructure(obj)) - +def jsonify_attr(obj): return json.dumps(converter.unstructure(obj)) def error_response(status_code, message): return JSONResponse(converter.unstructure(CohereErrorResponse(text=message)), status_code=status_code.value) - async def check_model(request, model): - if request.model is None or request.model == model: - return None + if request.model is None or request.model == model: return None return error_response( HTTPStatus.NOT_FOUND, f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.", ) - def mount_to_svc(svc, llm): app = Starlette( debug=True, routes=[ - Route( - '/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST'] - ), - Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']), Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False), + Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']), + Route('/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']), ], ) mount_path = '/cohere' svc.mount_asgi_app(app, path=mount_path) - return append_schemas( - svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG - ) - + return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG) @add_schema_definitions async def cohere_generate(req, llm): @@ -181,7 +165,6 @@ def _transpile_cohere_chat_messages(request: CohereChatRequest) -> list[dict[str messages.append({'role': 'user', 'content': request.message}) return messages - @add_schema_definitions async def cohere_chat(req, llm): json_str = await req.body() diff --git a/openllm-python/src/openllm/entrypoints/hf.py b/openllm-python/src/openllm/entrypoints/hf.py index d4e9b86f..51f230b8 100644 --- a/openllm-python/src/openllm/entrypoints/hf.py +++ b/openllm-python/src/openllm/entrypoints/hf.py @@ -1,14 +1,10 @@ -import functools -import logging +import functools, logging from http import HTTPStatus - import orjson from starlette.applications import Starlette from starlette.responses import JSONResponse from starlette.routing import Route - from openllm_core.utils import converter - from ._openapi import add_schema_definitions, append_schemas, get_generator from ..protocol.hf import AgentRequest, AgentResponse, HFErrorResponse @@ -25,7 +21,6 @@ schemas = get_generator( ) logger = logging.getLogger(__name__) - def mount_to_svc(svc, llm): app = Starlette( debug=True, @@ -39,13 +34,8 @@ def mount_to_svc(svc, llm): svc.mount_asgi_app(app, path=mount_path) return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append') - def error_response(status_code, message): - return JSONResponse( - converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), - status_code=status_code.value, - ) - + return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value) @add_schema_definitions async def hf_agent(req, llm): @@ -60,18 +50,14 @@ async def hf_agent(req, llm): stop = request.parameters.pop('stop', ['\n']) try: result = await 
llm.generate(request.inputs, stop=stop, **request.parameters) - return JSONResponse( - converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value - ) + return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value) except Exception as err: logger.error('Error while generating: %s', err) return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).') - @add_schema_definitions def hf_adapters(req, llm): - if not llm.has_adapters: - return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.') + if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.') return JSONResponse( { adapter_tuple[1]: {'adapter_name': k, 'adapter_type': adapter_tuple[0].peft_type.value} diff --git a/openllm-python/src/openllm/exceptions.py b/openllm-python/src/openllm/exceptions.py index a4c9b07d..3422fe58 100644 --- a/openllm-python/src/openllm/exceptions.py +++ b/openllm-python/src/openllm/exceptions.py @@ -1,10 +1,7 @@ from openllm_core.exceptions import ( - Error as Error, - FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError, - ForbiddenAttributeError as ForbiddenAttributeError, - GpuNotAvailableError as GpuNotAvailableError, + Error as Error, FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError, # + ForbiddenAttributeError as ForbiddenAttributeError, GpuNotAvailableError as GpuNotAvailableError, # + OpenLLMException as OpenLLMException, ValidationError as ValidationError, # MissingAnnotationAttributeError as MissingAnnotationAttributeError, MissingDependencyError as MissingDependencyError, - OpenLLMException as OpenLLMException, - ValidationError as ValidationError, ) diff --git a/openllm-python/src/openllm/protocol/__init__.py b/openllm-python/src/openllm/protocol/__init__.py index 8b6d271e..78c9cbaa 100644 --- a/openllm-python/src/openllm/protocol/__init__.py +++ b/openllm-python/src/openllm/protocol/__init__.py @@ -5,11 +5,6 @@ import typing as t from openllm_core.utils import LazyModule _import_structure: dict[str, list[str]] = {'openai': [], 'cohere': [], 'hf': []} - -if t.TYPE_CHECKING: - from . import cohere as cohere, hf as hf, openai as openai - +if t.TYPE_CHECKING: from . 
import cohere as cohere, hf as hf, openai as openai __lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ +__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/utils.py b/openllm-python/src/openllm/utils.py index da5865c3..ca33da9a 100644 --- a/openllm-python/src/openllm/utils.py +++ b/openllm-python/src/openllm/utils.py @@ -1,36 +1,16 @@ import functools, importlib.metadata, openllm_core - __all__ = ['generate_labels', 'available_devices', 'device_count'] - - def generate_labels(llm): return { - 'backend': llm.__llm_backend__, - 'framework': 'openllm', - 'model_name': llm.config['model_name'], - 'architecture': llm.config['architecture'], - 'serialisation': llm._serialisation, + 'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], # + 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation, # **{package: importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}}, } - - -def available_devices(): - from ._strategies import NvidiaGpuResource - - return tuple(NvidiaGpuResource.from_system()) - - +def available_devices(): from ._strategies import NvidiaGpuResource; return tuple(NvidiaGpuResource.from_system()) @functools.lru_cache(maxsize=1) -def device_count() -> int: - return len(available_devices()) - - +def device_count() -> int: return len(available_devices()) def __dir__(): - coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')]) - return sorted(__all__) + sorted(list(coreutils)) - - + coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')]); return sorted(__all__) + sorted(list(coreutils)) def __getattr__(it): - if hasattr(openllm_core.utils, it): - return getattr(openllm_core.utils, it) + if hasattr(openllm_core.utils, it): return getattr(openllm_core.utils, it) raise AttributeError(f'module {__name__} has no attribute {it}') diff --git a/openllm-python/src/openllm_cli/_sdk.py b/openllm-python/src/openllm_cli/_sdk.py index 9475d704..ab36f90c 100644 --- a/openllm-python/src/openllm_cli/_sdk.py +++ b/openllm-python/src/openllm_cli/_sdk.py @@ -1,22 +1,11 @@ from __future__ import annotations -import itertools -import logging -import os -import re -import subprocess -import sys -import typing as t - -import orjson +import itertools, logging, os, re, subprocess, sys, typing as t from simple_di import Provide, inject - -import bentoml -import openllm_core +import bentoml, openllm_core, orjson from bentoml._internal.configuration.containers import BentoMLContainer from openllm_core._typing_compat import LiteralSerialisation from openllm_core.exceptions import OpenLLMException from openllm_core.utils import WARNING_ENV_VAR, codegen, first_not_none, get_disable_warnings, is_vllm_available - if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore from openllm_core._configuration import LLMConfig @@ -24,7 +13,6 @@ if t.TYPE_CHECKING: logger = logging.getLogger(__name__) - def _start( model_id: str, timeout: int = 30, @@ -35,7 +23,6 @@ def _start( backend: LiteralBackend | None = None, additional_args: list[str] | None = None, cors: bool = False, - _serve_grpc: bool = False, __test__: bool = False, **_: t.Any, ) -> LLMConfig | subprocess.Popen[bytes]: @@ -73,13 +60,10 @@ def _start( 
backend: The backend to use for this LLM. By default, this is set to ``pt``. additional_args: Additional arguments to pass to ``openllm start``. """ - from .entrypoint import start_command, start_grpc_command - + from .entrypoint import start_command os.environ['BACKEND'] = openllm_core.utils.first_not_none(backend, default='vllm' if is_vllm_available() else 'pt') - args: list[str] = [model_id] - if timeout: - args.extend(['--server-timeout', str(timeout)]) + if timeout: args.extend(['--server-timeout', str(timeout)]) if workers_per_resource: args.extend( [ @@ -87,24 +71,19 @@ def _start( str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource, ] ) - if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): - args.extend(['--device', ','.join(device)]) - if quantize: - args.extend(['--quantize', str(quantize)]) - if cors: - args.append('--cors') + if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)]) + if quantize: args.extend(['--quantize', str(quantize)]) + if cors: args.append('--cors') if adapter_map: args.extend( list( itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()]) ) ) - if additional_args: - args.extend(additional_args) - if __test__: - args.append('--return-process') + if additional_args: args.extend(additional_args) + if __test__: args.append('--return-process') - cmd = start_command if not _serve_grpc else start_grpc_command + cmd = start_command return cmd.main(args=args, standalone_mode=False) @@ -159,7 +138,6 @@ def _build( ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud. """ from openllm.serialisation.transformers.weights import has_safetensors_weights - args: list[str] = [ sys.executable, '-m', @@ -173,32 +151,19 @@ def _build( serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy' ), ] - if quantize: - args.extend(['--quantize', quantize]) - if containerize and push: - raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.") - if push: - args.extend(['--push']) - if containerize: - args.extend(['--containerize']) - if build_ctx: - args.extend(['--build-ctx', build_ctx]) - if enable_features: - args.extend([f'--enable-features={f}' for f in enable_features]) - if overwrite: - args.append('--overwrite') - if adapter_map: - args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()]) - if model_version: - args.extend(['--model-version', model_version]) - if bento_version: - args.extend(['--bento-version', bento_version]) - if dockerfile_template: - args.extend(['--dockerfile-template', dockerfile_template]) - if additional_args: - args.extend(additional_args) - if force_push: - args.append('--force-push') + if quantize: args.extend(['--quantize', quantize]) + if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.") + if push: args.extend(['--push']) + if containerize: args.extend(['--containerize']) + if build_ctx: args.extend(['--build-ctx', build_ctx]) + if enable_features: args.extend([f'--enable-features={f}' for f in enable_features]) + if overwrite: args.append('--overwrite') + if adapter_map: args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()]) + if model_version: args.extend(['--model-version', model_version]) + if bento_version: 
args.extend(['--bento-version', bento_version]) + if dockerfile_template: args.extend(['--dockerfile-template', dockerfile_template]) + if additional_args: args.extend(additional_args) + if force_push: args.append('--force-push') current_disable_warning = get_disable_warnings() os.environ[WARNING_ENV_VAR] = str(True) @@ -206,24 +171,17 @@ def _build( output = subprocess.check_output(args, env=os.environ.copy(), cwd=build_ctx or os.getcwd()) except subprocess.CalledProcessError as e: logger.error("Exception caught while building Bento for '%s'", model_id, exc_info=e) - if e.stderr: - raise OpenLLMException(e.stderr.decode('utf-8')) from None + if e.stderr: raise OpenLLMException(e.stderr.decode('utf-8')) from None raise OpenLLMException(str(e)) from None matched = re.match(r'__object__:(\{.*\})$', output.decode('utf-8').strip()) if matched is None: - raise ValueError( - f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub." - ) + raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.") os.environ[WARNING_ENV_VAR] = str(current_disable_warning) try: result = orjson.loads(matched.group(1)) except orjson.JSONDecodeError as e: - raise ValueError( - f"Failed to decode JSON from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub." - ) from e + raise ValueError(f"Failed to decode JSON from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.") from e return bentoml.get(result['tag'], _bento_store=bento_store) - - def _import_model( model_id: str, model_version: str | None = None, @@ -260,32 +218,15 @@ def _import_model( ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud. 
""" from .entrypoint import import_command - args = [model_id, '--quiet'] - if backend is not None: - args.extend(['--backend', backend]) - if model_version is not None: - args.extend(['--model-version', str(model_version)]) - if quantize is not None: - args.extend(['--quantize', quantize]) - if serialisation is not None: - args.extend(['--serialisation', serialisation]) - if additional_args is not None: - args.extend(additional_args) + if backend is not None: args.extend(['--backend', backend]) + if model_version is not None: args.extend(['--model-version', str(model_version)]) + if quantize is not None: args.extend(['--quantize', quantize]) + if serialisation is not None: args.extend(['--serialisation', serialisation]) + if additional_args is not None: args.extend(additional_args) return import_command.main(args=args, standalone_mode=False) - - def _list_models() -> dict[str, t.Any]: '''List all available models within the local store.''' - from .entrypoint import models_command - - return models_command.main(args=['--quiet'], standalone_mode=False) - - -start, start_grpc = codegen.gen_sdk(_start, _serve_grpc=False), codegen.gen_sdk(_start, _serve_grpc=True) -build, import_model, list_models = ( - codegen.gen_sdk(_build), - codegen.gen_sdk(_import_model), - codegen.gen_sdk(_list_models), -) -__all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models'] + from .entrypoint import models_command; return models_command.main(args=['--quiet'], standalone_mode=False) +start, build, import_model, list_models = codegen.gen_sdk(_start), codegen.gen_sdk(_build), codegen.gen_sdk(_import_model), codegen.gen_sdk(_list_models) +__all__ = ['start', 'build', 'import_model', 'list_models']