From e3392476be87afbc1d80481dec25580c1fd56aa5 Mon Sep 17 00:00:00 2001
From: Aaron <29749331+aarnphm@users.noreply.github.com>
Date: Fri, 15 Mar 2024 03:47:23 -0400
Subject: [PATCH] revert: "ci: pre-commit autoupdate [pre-commit.ci] (#931)"

This reverts commit 7b00c84c2a0c79825114f0135563eca0d786d923.
---
 .pre-commit-config.yaml | 4 +-
 .../src/openllm_client/__init__.pyi | 24 +-
 openllm-client/src/openllm_client/_http.py | 28 +-
 openllm-client/src/openllm_client/_schemas.py | 23 +-
 openllm-client/src/openllm_client/_shim.py | 85 ++----
 openllm-client/src/openllm_client/_stream.py | 8 +-
 .../src/openllm_client/_typing_compat.py | 4 +-
 openllm-core/src/openllm_core/__init__.py | 12 +-
 .../src/openllm_core/_configuration.py | 252 +++++-------
 openllm-core/src/openllm_core/_schemas.py | 15 +-
 .../src/openllm_core/_typing_compat.py | 11 +-
 .../src/openllm_core/config/__init__.py | 6 +-
 .../openllm_core/config/configuration_auto.py | 25 +-
 .../config/configuration_dolly_v2.py | 4 +-
 .../config/configuration_flan_t5.py | 8 +-
 .../config/configuration_llama.py | 10 +-
 .../config/configuration_mistral.py | 10 +-
 .../config/configuration_mixtral.py | 6 +-
 .../openllm_core/config/configuration_opt.py | 18 +-
 .../openllm_core/config/configuration_phi.py | 16 +-
 .../src/openllm_core/utils/__init__.py | 18 +-
 .../src/openllm_core/utils/codegen.py | 8 +-
 openllm-core/src/openllm_core/utils/dantic.py | 31 +--
 .../src/openllm_core/utils/import_utils.py | 13 +-
 openllm-core/src/openllm_core/utils/lazy.py | 14 +-
 openllm-core/src/openllm_core/utils/peft.py | 34 +--
 openllm-core/src/openllm_core/utils/pkg.py | 2 +-
 openllm-core/src/openllm_core/utils/serde.py | 10 +-
 openllm-python/src/openllm/__init__.py | 12 +-
 openllm-python/src/openllm/__init__.pyi | 42 +--
 openllm-python/src/openllm/_deprecated.py | 13 +-
 openllm-python/src/openllm/_llm.py | 100 ++-----
 openllm-python/src/openllm/_llm.pyi | 15 +-
 openllm-python/src/openllm/_quantisation.py | 12 +-
 openllm-python/src/openllm/_quantisation.pyi | 16 +-
 openllm-python/src/openllm/_runners.py | 29 +-
 openllm-python/src/openllm/_runners.pyi | 23 +-
 openllm-python/src/openllm/_service.py | 16 +-
 openllm-python/src/openllm/_strategies.py | 20 +-
 openllm-python/src/openllm/_strategies.pyi | 11 +-
 openllm-python/src/openllm/bundle/_package.py | 26 +-
 .../src/openllm/bundle/_package.pyi | 5 +-
 .../src/openllm/entrypoints/__init__.py | 4 +-
 .../src/openllm/entrypoints/_openapi.py | 15 +-
 .../src/openllm/entrypoints/_openapi.pyi | 9 +-
 .../src/openllm/entrypoints/cohere.py | 25 +-
 .../src/openllm/entrypoints/cohere.pyi | 4 +-
 openllm-python/src/openllm/entrypoints/hf.py | 9 +-
 .../src/openllm/entrypoints/openai.py | 64 +----
 .../src/openllm/entrypoints/openai.pyi | 4 +-
 .../src/openllm/serialisation/__init__.py | 4 +-
 .../src/openllm/serialisation/_helpers.py | 17 +-
 .../src/openllm/serialisation/_helpers.pyi | 4 +-
 .../serialisation/ctranslate/__init__.py | 18 +-
 .../serialisation/transformers/__init__.py | 25 +-
 .../serialisation/transformers/_helpers.py | 4 +-
 openllm-python/src/openllm/utils.py | 2 +-
 openllm-python/src/openllm_cli/_factory.py | 82 ++----
 openllm-python/src/openllm_cli/_sdk.py | 19 +-
 openllm-python/src/openllm_cli/entrypoint.py | 204 +++-------
 .../src/openllm_cli/extension/dive_bentos.py | 4 +-
 .../extension/get_containerfile.py | 4 +-
 .../src/openllm_cli/extension/get_prompt.py | 32 +--
 .../src/openllm_cli/extension/list_models.py | 9 +-
 .../src/openllm_cli/extension/playground.py | 17 +-
 openllm-python/src/openllm_cli/termui.py | 17 +-
 openllm-python/tests/configuration_test.py | 11 +-
 openllm-python/tests/conftest.py | 14 +-
 openllm-python/tests/strategies_test.py | 8 +-
 69 files changed, 368 insertions(+), 1300 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0e3515ab..ba3d8cd9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ default_language_version:
 exclude: '.*\.(css|js|svg)$'
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.3.2'
+    rev: 'v0.2.2'
     hooks:
       - id: ruff
         alias: r
@@ -66,7 +66,7 @@ repos:
         exclude: ^(docs|tools|openllm-python/tests)
         args: [--config=pyproject.toml]
   - repo: https://github.com/pre-commit/mirrors-eslint
-    rev: v9.0.0-beta.2
+    rev: v9.0.0-beta.0
     hooks:
       - id: eslint
         verbose: true
diff --git a/openllm-client/src/openllm_client/__init__.pyi b/openllm-client/src/openllm_client/__init__.pyi
index bec8fc11..3b5ecfb3 100644
--- a/openllm-client/src/openllm_client/__init__.pyi
+++ b/openllm-client/src/openllm_client/__init__.pyi
@@ -15,17 +15,11 @@ class HTTPClient:
   address: str
   helpers: _Helpers
   @overload
-  def __init__(
-    self, address: str, timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...
-  ) -> None: ...
+  def __init__(self, address: str, timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ...
   @overload
-  def __init__(
-    self, address: str = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...
-  ) -> None: ...
+  def __init__(self, address: str = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ...
   @overload
-  def __init__(
-    self, address: None = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...
-  ) -> None: ...
+  def __init__(self, address: None = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ...
   @property
   def is_ready(self) -> bool: ...
   def health(self) -> bool: ...
@@ -66,17 +60,11 @@ class AsyncHTTPClient:
   address: str
   helpers: _AsyncHelpers
   @overload
-  def __init__(
-    self, address: str, timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...
-  ) -> None: ...
+  def __init__(self, address: str, timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ...
   @overload
-  def __init__(
-    self, address: str = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...
-  ) -> None: ...
+  def __init__(self, address: str = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ...
   @overload
-  def __init__(
-    self, address: None = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...
-  ) -> None: ...
+  def __init__(self, address: None = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ...
   @property
   def is_ready(self) -> bool: ...
   async def health(self) -> bool: ...
diff --git a/openllm-client/src/openllm_client/_http.py b/openllm-client/src/openllm_client/_http.py index ed802a67..9923468e 100644 --- a/openllm-client/src/openllm_client/_http.py +++ b/openllm-client/src/openllm_client/_http.py @@ -70,14 +70,10 @@ class HTTPClient(Client): return self.generate(prompt, **attrs) def health(self): - response = self._get( - '/readyz', response_cls=None, options={'return_raw_response': True, 'max_retries': self._max_retries} - ) + response = self._get('/readyz', response_cls=None, options={'return_raw_response': True, 'max_retries': self._max_retries}) return response.status_code == 200 - def generate( - self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs - ) -> Response: + def generate(self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs) -> Response: if timeout is None: timeout = self._timeout if verify is None: @@ -100,9 +96,7 @@ class HTTPClient(Client): for response_chunk in self.generate_iterator(prompt, llm_config, stop, adapter_name, timeout, verify, **attrs): yield StreamingResponse.from_response_chunk(response_chunk) - def generate_iterator( - self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs - ) -> t.Iterator[Response]: + def generate_iterator(self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs) -> t.Iterator[Response]: if timeout is None: timeout = self._timeout if verify is None: @@ -152,9 +146,7 @@ class AsyncHTTPClient(AsyncClient): @property async def _metadata(self) -> t.Awaitable[Metadata]: if self.__metadata is None: - self.__metadata = await self._post( - f'/{self._api_version}/metadata', response_cls=Metadata, json={}, options={'max_retries': self._max_retries} - ) + self.__metadata = await self._post(f'/{self._api_version}/metadata', response_cls=Metadata, json={}, options={'max_retries': self._max_retries}) return self.__metadata @property @@ -167,14 +159,10 @@ class AsyncHTTPClient(AsyncClient): return await self.generate(prompt, **attrs) async def health(self): - response = await self._get( - '/readyz', response_cls=None, options={'return_raw_response': True, 'max_retries': self._max_retries} - ) + response = await self._get('/readyz', response_cls=None, options={'return_raw_response': True, 'max_retries': self._max_retries}) return response.status_code == 200 - async def generate( - self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs - ) -> Response: + async def generate(self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs) -> Response: if timeout is None: timeout = self._timeout if verify is None: @@ -195,9 +183,7 @@ class AsyncHTTPClient(AsyncClient): async def generate_stream( self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs ) -> t.AsyncGenerator[StreamingResponse, t.Any]: - async for response_chunk in self.generate_iterator( - prompt, llm_config, stop, adapter_name, timeout, verify, **attrs - ): + async for response_chunk in self.generate_iterator(prompt, llm_config, stop, adapter_name, timeout, verify, **attrs): yield StreamingResponse.from_response_chunk(response_chunk) async def generate_iterator( diff --git a/openllm-client/src/openllm_client/_schemas.py b/openllm-client/src/openllm_client/_schemas.py index 1723d583..037c92b3 100644 --- a/openllm-client/src/openllm_client/_schemas.py +++ 
b/openllm-client/src/openllm_client/_schemas.py @@ -17,7 +17,7 @@ if t.TYPE_CHECKING: from ._shim import AsyncClient, Client -__all__ = ['CompletionChunk', 'Helpers', 'Metadata', 'Response', 'StreamingResponse'] +__all__ = ['Response', 'CompletionChunk', 'Metadata', 'StreamingResponse', 'Helpers'] @attr.define @@ -42,11 +42,7 @@ def _structure_metadata(data: t.Dict[str, t.Any], cls: type[Metadata]) -> Metada raise RuntimeError(f'Malformed metadata configuration (Server-side issue): {e}') from None try: return cls( - model_id=data['model_id'], - timeout=data['timeout'], - model_name=data['model_name'], - backend=data['backend'], - configuration=configuration, + model_id=data['model_id'], timeout=data['timeout'], model_name=data['model_name'], backend=data['backend'], configuration=configuration ) except Exception as e: raise RuntimeError(f'Malformed metadata (Server-side issue): {e}') from None @@ -65,10 +61,7 @@ class StreamingResponse(_SchemaMixin): @classmethod def from_response_chunk(cls, response: Response) -> StreamingResponse: return cls( - request_id=response.request_id, - index=response.outputs[0].index, - text=response.outputs[0].text, - token_ids=response.outputs[0].token_ids[0], + request_id=response.request_id, index=response.outputs[0].index, text=response.outputs[0].text, token_ids=response.outputs[0].token_ids[0] ) @@ -95,17 +88,11 @@ class Helpers: return self._async_client def messages(self, messages, add_generation_prompt=False): - return self.client._post( - '/v1/helpers/messages', - response_cls=str, - json=dict(messages=messages, add_generation_prompt=add_generation_prompt), - ) + return self.client._post('/v1/helpers/messages', response_cls=str, json=dict(messages=messages, add_generation_prompt=add_generation_prompt)) async def async_messages(self, messages, add_generation_prompt=False): return await self.async_client._post( - '/v1/helpers/messages', - response_cls=str, - json=dict(messages=messages, add_generation_prompt=add_generation_prompt), + '/v1/helpers/messages', response_cls=str, json=dict(messages=messages, add_generation_prompt=add_generation_prompt) ) @classmethod diff --git a/openllm-client/src/openllm_client/_shim.py b/openllm-client/src/openllm_client/_shim.py index 4c7470e7..04a9a730 100644 --- a/openllm-client/src/openllm_client/_shim.py +++ b/openllm-client/src/openllm_client/_shim.py @@ -140,9 +140,7 @@ class APIResponse(t.Generic[Response]): data = self._raw_response.json() try: - return self._client._process_response_data( - data=data, response_cls=self._response_cls, raw_response=self._raw_response - ) + return self._client._process_response_data(data=data, response_cls=self._response_cls, raw_response=self._raw_response) except Exception as exc: raise ValueError(exc) from None # validation error here @@ -273,16 +271,10 @@ class BaseClient(t.Generic[InnerClient, StreamType]): def _build_request(self, options: RequestOptions) -> httpx.Request: return self._inner.build_request( - method=options.method, - headers=self._build_headers(options), - url=self._prepare_url(options.url), - json=options.json, - params=options.params, + method=options.method, headers=self._build_headers(options), url=self._prepare_url(options.url), json=options.json, params=options.params ) - def _calculate_retry_timeout( - self, remaining_retries: int, options: RequestOptions, headers: t.Optional[httpx.Headers] = None - ) -> float: + def _calculate_retry_timeout(self, remaining_retries: int, options: RequestOptions, headers: t.Optional[httpx.Headers] = None) -> float: 
max_retries = options.get_max_retries(self._max_retries) # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After try: @@ -323,9 +315,7 @@ class BaseClient(t.Generic[InnerClient, StreamType]): return True return False - def _process_response_data( - self, *, response_cls: type[Response], data: t.Dict[str, t.Any], raw_response: httpx.Response - ) -> Response: + def _process_response_data(self, *, response_cls: type[Response], data: t.Dict[str, t.Any], raw_response: httpx.Response) -> Response: return converter.structure(data, response_cls) def _process_response( @@ -338,24 +328,13 @@ class BaseClient(t.Generic[InnerClient, StreamType]): stream_cls: type[_Stream] | type[_AsyncStream] | None, ) -> Response: return APIResponse( - raw_response=raw_response, - client=self, - response_cls=response_cls, - stream=stream, - stream_cls=stream_cls, - options=options, + raw_response=raw_response, client=self, response_cls=response_cls, stream=stream, stream_cls=stream_cls, options=options ).parse() @attr.define(init=False) class Client(BaseClient[httpx.Client, Stream[t.Any]]): - def __init__( - self, - base_url: str | httpx.URL, - version: str, - timeout: int | httpx.Timeout = DEFAULT_TIMEOUT, - max_retries: int = MAX_RETRIES, - ): + def __init__(self, base_url: str | httpx.URL, version: str, timeout: int | httpx.Timeout = DEFAULT_TIMEOUT, max_retries: int = MAX_RETRIES): super().__init__( base_url=base_url, version=version, @@ -387,13 +366,7 @@ class Client(BaseClient[httpx.Client, Stream[t.Any]]): stream: bool = False, stream_cls: type[_Stream] | None = None, ) -> Response | _Stream: - return self._request( - response_cls=response_cls, - options=options, - remaining_retries=remaining_retries, - stream=stream, - stream_cls=stream_cls, - ) + return self._request(response_cls=response_cls, options=options, remaining_retries=remaining_retries, stream=stream, stream_cls=stream_cls) def _request( self, @@ -412,9 +385,7 @@ class Client(BaseClient[httpx.Client, Stream[t.Any]]): response.raise_for_status() except httpx.HTTPStatusError as exc: if retries > 0 and self._should_retry(exc.response): - return self._retry_request( - response_cls, options, retries, exc.response.headers, stream=stream, stream_cls=stream_cls - ) + return self._retry_request(response_cls, options, retries, exc.response.headers, stream=stream, stream_cls=stream_cls) # If the response is streamed then we need to explicitly read the completed response exc.response.read() raise ValueError(exc.message) from None @@ -427,9 +398,7 @@ class Client(BaseClient[httpx.Client, Stream[t.Any]]): return self._retry_request(response_cls, options, retries, stream=stream, stream_cls=stream_cls) raise ValueError(request) from None # connection error - return self._process_response( - response_cls=response_cls, options=options, raw_response=response, stream=stream, stream_cls=stream_cls - ) + return self._process_response(response_cls=response_cls, options=options, raw_response=response, stream=stream, stream_cls=stream_cls) def _retry_request( self, @@ -459,9 +428,7 @@ class Client(BaseClient[httpx.Client, Stream[t.Any]]): ) -> Response | _Stream: if options is None: options = {} - return self.request( - response_cls, RequestOptions(method='GET', url=path, **options), stream=stream, stream_cls=stream_cls - ) + return self.request(response_cls, RequestOptions(method='GET', url=path, **options), stream=stream, stream_cls=stream_cls) def _post( self, @@ -475,20 +442,12 @@ class Client(BaseClient[httpx.Client, Stream[t.Any]]): ) -> Response | 
_Stream: if options is None: options = {} - return self.request( - response_cls, RequestOptions(method='POST', url=path, json=json, **options), stream=stream, stream_cls=stream_cls - ) + return self.request(response_cls, RequestOptions(method='POST', url=path, json=json, **options), stream=stream, stream_cls=stream_cls) @attr.define(init=False) class AsyncClient(BaseClient[httpx.AsyncClient, AsyncStream[t.Any]]): - def __init__( - self, - base_url: str | httpx.URL, - version: str, - timeout: int | httpx.Timeout = DEFAULT_TIMEOUT, - max_retries: int = MAX_RETRIES, - ): + def __init__(self, base_url: str | httpx.URL, version: str, timeout: int | httpx.Timeout = DEFAULT_TIMEOUT, max_retries: int = MAX_RETRIES): super().__init__( base_url=base_url, version=version, @@ -527,9 +486,7 @@ class AsyncClient(BaseClient[httpx.AsyncClient, AsyncStream[t.Any]]): stream: bool = False, stream_cls: type[_AsyncStream] | None = None, ) -> Response | _AsyncStream: - return await self._request( - response_cls, options, remaining_retries=remaining_retries, stream=stream, stream_cls=stream_cls - ) + return await self._request(response_cls, options, remaining_retries=remaining_retries, stream=stream, stream_cls=stream_cls) async def _request( self, @@ -549,9 +506,7 @@ class AsyncClient(BaseClient[httpx.AsyncClient, AsyncStream[t.Any]]): response.raise_for_status() except httpx.HTTPStatusError as exc: if retries > 0 and self._should_retry(exc.response): - return self._retry_request( - response_cls, options, retries, exc.response.headers, stream=stream, stream_cls=stream_cls - ) + return self._retry_request(response_cls, options, retries, exc.response.headers, stream=stream, stream_cls=stream_cls) # If the response is streamed then we need to explicitly read the completed response await exc.response.aread() raise ValueError(exc.message) from None @@ -571,9 +526,7 @@ class AsyncClient(BaseClient[httpx.AsyncClient, AsyncStream[t.Any]]): return await self._retry_request(response_cls, options, retries, stream=stream, stream_cls=stream_cls) raise ValueError(request) from err # connection error - return self._process_response( - response_cls=response_cls, options=options, raw_response=response, stream=stream, stream_cls=stream_cls - ) + return self._process_response(response_cls=response_cls, options=options, raw_response=response, stream=stream, stream_cls=stream_cls) async def _retry_request( self, @@ -602,9 +555,7 @@ class AsyncClient(BaseClient[httpx.AsyncClient, AsyncStream[t.Any]]): ) -> Response | _AsyncStream: if options is None: options = {} - return await self.request( - response_cls, RequestOptions(method='GET', url=path, **options), stream=stream, stream_cls=stream_cls - ) + return await self.request(response_cls, RequestOptions(method='GET', url=path, **options), stream=stream, stream_cls=stream_cls) async def _post( self, @@ -618,6 +569,4 @@ class AsyncClient(BaseClient[httpx.AsyncClient, AsyncStream[t.Any]]): ) -> Response | _AsyncStream: if options is None: options = {} - return await self.request( - response_cls, RequestOptions(method='POST', url=path, json=json, **options), stream=stream, stream_cls=stream_cls - ) + return await self.request(response_cls, RequestOptions(method='POST', url=path, json=json, **options), stream=stream, stream_cls=stream_cls) diff --git a/openllm-client/src/openllm_client/_stream.py b/openllm-client/src/openllm_client/_stream.py index e81a7fb0..a5103207 100644 --- a/openllm-client/src/openllm_client/_stream.py +++ b/openllm-client/src/openllm_client/_stream.py @@ -38,9 
+38,7 @@ class Stream(t.Generic[Response]): if sse.data.startswith('[DONE]'): break if sse.event is None: - yield self._client._process_response_data( - data=sse.model_dump(), response_cls=self._response_cls, raw_response=self._response - ) + yield self._client._process_response_data(data=sse.model_dump(), response_cls=self._response_cls, raw_response=self._response) @attr.define(auto_attribs=True) @@ -71,9 +69,7 @@ class AsyncStream(t.Generic[Response]): if sse.data.startswith('[DONE]'): break if sse.event is None: - yield self._client._process_response_data( - data=sse.model_dump(), response_cls=self._response_cls, raw_response=self._response - ) + yield self._client._process_response_data(data=sse.model_dump(), response_cls=self._response_cls, raw_response=self._response) @attr.define diff --git a/openllm-client/src/openllm_client/_typing_compat.py b/openllm-client/src/openllm_client/_typing_compat.py index 15d86f8a..48bd0a85 100644 --- a/openllm-client/src/openllm_client/_typing_compat.py +++ b/openllm-client/src/openllm_client/_typing_compat.py @@ -11,7 +11,5 @@ from openllm_core._typing_compat import ( overload as overload, ) -Platform = Annotated[ - LiteralString, Literal['MacOS', 'Linux', 'Windows', 'FreeBSD', 'OpenBSD', 'iOS', 'iPadOS', 'Android', 'Unknown'], str -] +Platform = Annotated[LiteralString, Literal['MacOS', 'Linux', 'Windows', 'FreeBSD', 'OpenBSD', 'iOS', 'iPadOS', 'Android', 'Unknown'], str] Architecture = Annotated[LiteralString, Literal['arm', 'arm64', 'x86', 'x86_64', 'Unknown'], str] diff --git a/openllm-core/src/openllm_core/__init__.py b/openllm-core/src/openllm_core/__init__.py index aea63eb5..bb5414db 100644 --- a/openllm-core/src/openllm_core/__init__.py +++ b/openllm-core/src/openllm_core/__init__.py @@ -1,14 +1,6 @@ from . import exceptions as exceptions, utils as utils -from ._configuration import ( - GenerationConfig as GenerationConfig, - LLMConfig as LLMConfig, - SamplingParams as SamplingParams, -) -from ._schemas import ( - GenerationInput as GenerationInput, - GenerationOutput as GenerationOutput, - MetadataOutput as MetadataOutput, -) +from ._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams +from ._schemas import GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput from .config import ( CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py index c48e797e..18a31ec3 100644 --- a/openllm-core/src/openllm_core/_configuration.py +++ b/openllm-core/src/openllm_core/_configuration.py @@ -35,12 +35,7 @@ from ._typing_compat import ( T, overload, ) -from .exceptions import ( - ForbiddenAttributeError, - MissingDependencyError, - MissingAnnotationAttributeError, - ValidationError, -) +from .exceptions import ForbiddenAttributeError, MissingDependencyError, MissingAnnotationAttributeError, ValidationError from .utils import LazyLoader, ReprMixin, codegen, converter, dantic, field_env_key, first_not_none, lenient_issubclass from .utils.peft import PEFT_TASK_TYPE_TARGET_MAPPING, FineTuneConfig @@ -56,16 +51,11 @@ if t.TYPE_CHECKING: from ._schemas import MessageParam else: - vllm = LazyLoader( - 'vllm', - globals(), - 'vllm', - exc_msg='vLLM is not installed. Make sure to install it with `pip install "openllm[vllm]"`', - ) + vllm = LazyLoader('vllm', globals(), 'vllm', exc_msg='vLLM is not installed. 
Make sure to install it with `pip install "openllm[vllm]"`') transformers = LazyLoader('transformers', globals(), 'transformers') peft = LazyLoader('peft', globals(), 'peft') -__all__ = ['GenerationConfig', 'LLMConfig', 'SamplingParams', 'field_env_key'] +__all__ = ['LLMConfig', 'GenerationConfig', 'SamplingParams', 'field_env_key'] logger = logging.getLogger(__name__) config_merger = Merger([(dict, 'merge')], ['override'], ['override']) @@ -74,17 +64,13 @@ _object_setattr = object.__setattr__ @attr.frozen(slots=True, repr=False, init=False) class GenerationConfig(ReprMixin): - max_new_tokens: int = dantic.Field( - 20, ge=0, description='The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.' - ) + max_new_tokens: int = dantic.Field(20, ge=0, description='The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.') min_length: int = dantic.Field( 0, ge=0, # description='The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.', ) - min_new_tokens: int = dantic.Field( - description='The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.' - ) + min_new_tokens: int = dantic.Field(description='The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.') early_stopping: bool = dantic.Field( False, description="Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates; `'never'`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm) ", @@ -101,15 +87,10 @@ class GenerationConfig(ReprMixin): description='The values balance the model confidence and the degeneration penalty in contrastive search decoding.' ) use_cache: bool = dantic.Field( - True, - description='Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.', - ) - temperature: float = dantic.Field( - 1.0, ge=0.0, le=1.0, description='The value used to modulate the next token probabilities.' - ) - top_k: int = dantic.Field( - 50, description='The number of highest probability vocabulary tokens to keep for top-k-filtering.' + True, description='Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.' ) + temperature: float = dantic.Field(1.0, ge=0.0, le=1.0, description='The value used to modulate the next token probabilities.') + top_k: int = dantic.Field(50, description='The number of highest probability vocabulary tokens to keep for top-k-filtering.') top_p: float = dantic.Field( 1.0, description='If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.', @@ -142,9 +123,7 @@ class GenerationConfig(ReprMixin): 1.0, description='Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. 
negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences.', ) - no_repeat_ngram_size: int = dantic.Field( - 0, description='If set to int > 0, all ngrams of that size can only occur once.' - ) + no_repeat_ngram_size: int = dantic.Field(0, description='If set to int > 0, all ngrams of that size can only occur once.') bad_words_ids: t.List[t.List[int]] = dantic.Field( description='List of token ids that are not allowed to be generated. In order to get the token ids of the words that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`.' ) @@ -177,20 +156,16 @@ class GenerationConfig(ReprMixin): forced_decoder_ids: t.List[t.List[int]] = dantic.Field( description='A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123.' ) - num_return_sequences: int = dantic.Field( - 1, description='The number of independently computed returned sequences for each element in the batch.' - ) + num_return_sequences: int = dantic.Field(1, description='The number of independently computed returned sequences for each element in the batch.') output_attentions: bool = dantic.Field( False, description='Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.', ) output_hidden_states: bool = dantic.Field( - False, - description='Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.', + False, description='Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.' ) output_scores: bool = dantic.Field( - False, - description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.', + False, description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.' ) pad_token_id: int = dantic.Field(description='The id of the *padding* token.') bos_token_id: int = dantic.Field(description='The id of the *beginning-of-sequence* token.') @@ -198,23 +173,18 @@ class GenerationConfig(ReprMixin): description='The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.' ) encoder_no_repeat_ngram_size: int = dantic.Field( - 0, - description='If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.', + 0, description='If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.' ) decoder_start_token_id: int = dantic.Field( description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.' ) # NOTE: This is now implemented and supported for both PyTorch and vLLM logprobs: t.Optional[int] = dantic.Field(None, description='Number of log probabilities to return per output token.') - prompt_logprobs: t.Optional[int] = dantic.Field( - None, description='Number of log probabilities to return per input token.' 
- ) + prompt_logprobs: t.Optional[int] = dantic.Field(None, description='Number of log probabilities to return per input token.') def __init__(self, *, _internal: bool = False, **attrs: t.Any): if not _internal: - raise RuntimeError( - 'GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config' - ) + raise RuntimeError('GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config') self.__attrs_init__(**attrs) def __getitem__(self, item: str) -> t.Any: @@ -265,8 +235,7 @@ class SamplingParams(ReprMixin): ) use_beam_search: bool = dantic.Field(False, description='Whether to use beam search instead of sampling.') ignore_eos: bool = dantic.Field( - False, - description='Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.', + False, description='Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.' ) skip_special_tokens: bool = dantic.Field(True, description='Whether to skip special tokens in the generated output.') # space_between_special_tokens: bool = dantic.Field(True, description='Whether to add a space between special tokens in the generated output.') @@ -285,9 +254,7 @@ class SamplingParams(ReprMixin): def __init__(self, *, _internal: bool = False, **attrs: t.Any): if not _internal: - raise RuntimeError( - 'SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config.' - ) + raise RuntimeError('SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config.') _object_setattr(self, 'max_tokens', attrs.pop('max_tokens', 16)) _object_setattr(self, 'temperature', attrs.pop('temperature', 1.0)) _object_setattr(self, 'top_k', attrs.pop('top_k', -1)) @@ -332,12 +299,8 @@ class SamplingParams(ReprMixin): temperature = first_not_none(attrs.pop('temperature', None), default=generation_config['temperature']) top_k = first_not_none(attrs.pop('top_k', None), default=generation_config['top_k']) top_p = first_not_none(attrs.pop('top_p', None), default=generation_config['top_p']) - max_tokens = first_not_none( - attrs.pop('max_tokens', None), attrs.pop('max_new_tokens', None), default=generation_config['max_new_tokens'] - ) - repetition_penalty = first_not_none( - attrs.pop('repetition_penalty', None), default=generation_config['repetition_penalty'] - ) + max_tokens = first_not_none(attrs.pop('max_tokens', None), attrs.pop('max_new_tokens', None), default=generation_config['max_new_tokens']) + repetition_penalty = first_not_none(attrs.pop('repetition_penalty', None), default=generation_config['repetition_penalty']) length_penalty = first_not_none(attrs.pop('length_penalty', None), default=generation_config['length_penalty']) early_stopping = first_not_none(attrs.pop('early_stopping', None), default=generation_config['early_stopping']) logprobs = first_not_none(attrs.pop('logprobs', None), default=generation_config['logprobs']) @@ -372,16 +335,12 @@ converter.register_unstructure_hook_factory( converter, _cattrs_omit_if_default=False, _cattrs_use_linecache=True, - **{ - k: override(omit_if_default=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING) - }, + **{k: override(omit_if_default=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)}, ), ) converter.register_structure_hook_factory( lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), - lambda cls: 
make_dict_structure_fn( - cls, converter, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens') - ), + lambda cls: make_dict_structure_fn(cls, converter, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens')), ) _SamplingParamsT = t.TypeVar('_SamplingParamsT', bound=SamplingParams) @@ -505,8 +464,7 @@ def structure_settings(cls: type[LLMConfig], _: type[_ModelSettingsAttr]) -> _Mo _attr.update({ 'service_name': f'generated_{_attr["model_name"] if "model_name" in _attr else _config.model_name}_service.py', 'fine_tune_strategies': { - ft_config.get('adapter_type', 'lora'): FineTuneConfig.from_config(ft_config, cls) - for ft_config in _config.fine_tune_strategies + ft_config.get('adapter_type', 'lora'): FineTuneConfig.from_config(ft_config, cls) for ft_config in _config.fine_tune_strategies } if _config.fine_tune_strategies else {}, @@ -518,14 +476,9 @@ converter.register_structure_hook(_ModelSettingsAttr, structure_settings) _reserved_namespace = {'__config__', 'GenerationConfig', 'SamplingParams'} +def _setattr_class(attr_name: str, value_var: t.Any) -> str: return f"setattr(cls, '{attr_name}', {value_var})" -def _setattr_class(attr_name: str, value_var: t.Any) -> str: - return f"setattr(cls, '{attr_name}', {value_var})" - - -def _make_assignment_script( - cls: type[LLMConfig], attributes: attr.AttrsInstance -) -> t.Callable[[type[LLMConfig]], None]: +def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance) -> t.Callable[[type[LLMConfig]], None]: args, lines, annotations = [], [], {'return': None} globs = {'cls': cls, '_cached_attribute': attributes} for attr_name, field in attr.fields_dict(attributes.__class__).items(): @@ -539,9 +492,7 @@ def _make_assignment_script( @attr.define(slots=True) class _ConfigAttr(t.Generic[_GenerationConfigT, _SamplingParamsT]): @staticmethod - def Field(default: t.Any = None, **attrs: t.Any) -> t.Any: - return dantic.Field(default, **attrs) - + def Field(default: t.Any = None, **attrs: t.Any) -> t.Any: return dantic.Field(default, **attrs) if t.TYPE_CHECKING: __config__: t.ClassVar[ModelSettings] = Field(None) GenerationConfig: _GenerationConfigT = Field(None) @@ -556,22 +507,22 @@ class _ConfigAttr(t.Generic[_GenerationConfigT, _SamplingParamsT]): # update-config-stubs.py: special start __openllm_default_id__: str = Field(None) - """Return the default model to use when using 'openllm start '. + '''Return the default model to use when using 'openllm start '. This could be one of the keys in 'self.model_ids' or custom users model. This field is required when defining under '__config__'. - """ + ''' __openllm_model_ids__: ListStr = Field(None) - """A list of supported pretrained models tag for this given runnable. + '''A list of supported pretrained models tag for this given runnable. For example: For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"] This field is required when defining under '__config__'. - """ + ''' __openllm_architecture__: str = Field(None) - """The model architecture that is supported by this LLM. + '''The model architecture that is supported by this LLM. Note that any model weights within this architecture generation can always be run and supported by this LLM. 
@@ -580,33 +531,33 @@ class _ConfigAttr(t.Generic[_GenerationConfigT, _SamplingParamsT]): ```bash openllm start stabilityai/stablelm-tuned-alpha-3b - ```""" + ```''' __openllm_url__: str = Field(None) - """The resolved url for this LLMConfig.""" + '''The resolved url for this LLMConfig.''' __openllm_serialisation__: LiteralSerialisation = Field(None) - """Default serialisation format for different models. Some will default to use the legacy 'bin'. """ + '''Default serialisation format for different models. Some will default to use the legacy 'bin'. ''' __openllm_trust_remote_code__: bool = Field(None) - """Whether to always trust remote code""" + '''Whether to always trust remote code''' __openllm_service_name__: str = Field(None) '''Generated service name for this LLMConfig. By default, it is "generated_{model_name}_service.py"''' __openllm_requirements__: t.Optional[ListStr] = Field(None) - """The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.""" + '''The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.''' __openllm_model_type__: t.Literal['causal_lm', 'seq2seq_lm'] = Field(None) '''The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"''' __openllm_name_type__: t.Optional[t.Literal['dasherize', 'lowercase']] = Field(None) - """The default name typed for this model. "dasherize" will convert the name to lowercase and + '''The default name typed for this model. "dasherize" will convert the name to lowercase and replace spaces with dashes. "lowercase" will convert the name to lowercase. If this is not set, then both - `model_name` and `start_name` must be specified.""" + `model_name` and `start_name` must be specified.''' __openllm_backend__: t.Tuple[LiteralBackend, ...] = Field(None) - """List of supported backend for this given LLM class. Currently, we support "pt" and "vllm".""" + '''List of supported backend for this given LLM class. Currently, we support "pt" and "vllm".''' __openllm_model_name__: str = Field(None) - """The normalized version of __openllm_start_name__, determined by __openllm_name_type__""" + '''The normalized version of __openllm_start_name__, determined by __openllm_name_type__''' __openllm_start_name__: str = Field(None) - """Default name to be used with `openllm start`""" + '''Default name to be used with `openllm start`''' __openllm_timeout__: int = Field(None) - """The default timeout to be set for this given LLM.""" + '''The default timeout to be set for this given LLM.''' __openllm_workers_per_resource__: t.Union[int, float] = Field(None) - """The number of workers per resource. This is used to determine the number of workers to use for this model. + '''The number of workers per resource. This is used to determine the number of workers to use for this model. For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource. @@ -614,40 +565,20 @@ class _ConfigAttr(t.Generic[_GenerationConfigT, _SamplingParamsT]): https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details. By default, it is set to 1. 
- """ + ''' __openllm_fine_tune_strategies__: t.Dict[AdapterType, FineTuneConfig] = Field(None) - """The fine-tune strategies for this given LLM.""" + '''The fine-tune strategies for this given LLM.''' # update-config-stubs.py: special stop class _ConfigBuilder: - __slots__ = ( - '_attr_names', - '_attrs', - '_base_attr_map', - '_base_names', - '_cls', - '_cls_dict', - '_has_post_init', - '_has_pre_init', - '_model_name', - ) + __slots__ = ('_cls', '_cls_dict', '_attr_names', '_attrs', '_model_name', '_base_attr_map', '_base_names', '_has_pre_init', '_has_post_init') def __init__( - self, - cls: type[LLMConfig], - these: dict[str, _CountingAttr], - auto_attribs: bool = False, - kw_only: bool = False, - collect_by_mro: bool = True, + self, cls: type[LLMConfig], these: dict[str, _CountingAttr], auto_attribs: bool = False, kw_only: bool = False, collect_by_mro: bool = True ): attrs, base_attrs, base_attr_map = _transform_attrs( - cls, - these, - auto_attribs, - kw_only, - collect_by_mro, - field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__), + cls, these, auto_attribs, kw_only, collect_by_mro, field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__) ) self._cls, self._model_name, self._cls_dict = cls, cls.__openllm_model_name__, dict(cls.__dict__) self._attrs = attrs @@ -676,15 +607,11 @@ class _ConfigBuilder: for base_cls in self._cls.__mro__[1:-1]: if base_cls.__dict__.get('__weakref__', None) is not None: weakref_inherited = True - existing_slots.update({ - name: getattr(base_cls, name, codegen._sentinel) for name in getattr(base_cls, '__slots__', []) - }) + existing_slots.update({name: getattr(base_cls, name, codegen._sentinel) for name in getattr(base_cls, '__slots__', [])}) names = self._attr_names base_names = set(self._base_names) - if ( - '__weakref__' not in getattr(self._cls, '__slots__', ()) and '__weakref__' not in names and not weakref_inherited - ): + if '__weakref__' not in getattr(self._cls, '__slots__', ()) and '__weakref__' not in names and not weakref_inherited: names += ('__weakref__',) # We only add the names of attributes that aren't inherited. # Setting __slots__ to inherited attributes wastes memory. @@ -766,9 +693,7 @@ class _ConfigBuilder: for key, fn in ReprMixin.__dict__.items(): if key in ('__repr__', '__str__', '__repr_name__', '__repr_str__', '__repr_args__'): self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn) - self._cls_dict['__repr_keys__'] = property( - lambda _: {i.name for i in self._attrs} | {'generation_config', 'sampling_config'} - ) + self._cls_dict['__repr_keys__'] = property(lambda _: {i.name for i in self._attrs} | {'generation_config', 'sampling_config'}) return self @@ -779,15 +704,12 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): logger.warning("LLMConfig subclass should end with 'Config'. Updating to %sConfig", cls.__name__) cls.__name__ = f'{cls.__name__}Config' - if not hasattr(cls, '__config__'): - raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.") + if not hasattr(cls, '__config__'): raise RuntimeError("Given LLMConfig must have '__config__' that is not None defined.") # auto assignment attributes generated from __config__ after create the new slot class. 
_make_assignment_script(cls, converter.structure(cls, _ModelSettingsAttr))(cls) - def _make_subclass( - class_attr: str, base: type[At], globs: dict[str, t.Any] | None = None, suffix_env: LiteralString | None = None - ) -> type[At]: + def _make_subclass(class_attr: str, base: type[At], globs: dict[str, t.Any] | None = None, suffix_env: LiteralString | None = None) -> type[At]: camel_name = cls.__name__.replace('Config', '') klass = attr.make_class( f'{camel_name}{class_attr}', @@ -804,9 +726,7 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): cls.__openllm_model_name__, suffix=suffix_env, globs=globs, - default_callback=lambda field_name, field_default: getattr( - getattr(cls, class_attr), field_name, field_default - ) + default_callback=lambda field_name, field_default: getattr(getattr(cls, class_attr), field_name, field_default) if codegen.has_own_attribute(cls, class_attr) else field_default, ), @@ -866,11 +786,7 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): # the hint cache for easier access cls.__openllm_hints__ = { f.name: f.type - for ite in [ - attr.fields(cls), - attr.fields(cls.__openllm_generation_class__), - attr.fields(cls.__openllm_sampling_class__), - ] + for ite in [attr.fields(cls), attr.fields(cls.__openllm_generation_class__), attr.fields(cls.__openllm_sampling_class__)] for f in ite } @@ -903,24 +819,19 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): if generation_config is None: generation_config = {k: v for k, v in attrs.items() if k in _generation_cl_dict} else: - generation_config = config_merger.merge( - generation_config, {k: v for k, v in attrs.items() if k in _generation_cl_dict} - ) + generation_config = config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in _generation_cl_dict}) if sampling_config is None: sampling_config = {k: v for k, v in attrs.items() if k in _sampling_cl_dict} else: - sampling_config = config_merger.merge( - sampling_config, {k: v for k, v in attrs.items() if k in _sampling_cl_dict} - ) + sampling_config = config_merger.merge(sampling_config, {k: v for k, v in attrs.items() if k in _sampling_cl_dict}) for k in _cached_keys: if k in generation_config or k in sampling_config or attrs[k] is None: del attrs[k] self.__openllm_config_override__ = __openllm_config_override__ or {} self.__openllm_extras__ = config_merger.merge( - first_not_none(__openllm_extras__, default={}), - {k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__}, + first_not_none(__openllm_extras__, default={}), {k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__} ) self.generation_config = self['generation_class'](_internal=True, **generation_config) self.sampling_config = self['sampling_class'].from_generation_config(self.generation_config, **sampling_config) @@ -1013,9 +924,7 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): @overload def __getitem__(self, item: t.Literal['bad_words_ids']) -> t.List[t.List[int]]: ... @overload - def __getitem__( - self, item: t.Literal['force_words_ids'] - ) -> t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]]: ... + def __getitem__(self, item: t.Literal['force_words_ids']) -> t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]]: ... @overload def __getitem__(self, item: t.Literal['renormalize_logits']) -> bool: ... 
@overload @@ -1102,9 +1011,7 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): raise TypeError(f"{self} doesn't understand how to index None.") item = inflection.underscore(item) if item in _reserved_namespace: - raise ForbiddenAttributeError( - f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified." - ) + raise ForbiddenAttributeError(f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified.") internal_attributes = f'__openllm_{item}__' if hasattr(self, internal_attributes): if item in self.__openllm_config_override__: @@ -1125,9 +1032,7 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): def __getattribute__(self, item: str) -> t.Any: if item in _reserved_namespace: - raise ForbiddenAttributeError( - f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified." - ) + raise ForbiddenAttributeError(f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified.") return _object_getattribute.__get__(self)(item) def __len__(self) -> int: @@ -1241,9 +1146,7 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): sampling_config = attrs.pop('sampling_config') elif 'llm_config' in attrs: # NOTE: this is the new key llm_config = attrs.pop('llm_config') - generation_config = { - k: v for k, v in llm_config.items() if k in attr.fields_dict(cls.__openllm_generation_class__) - } + generation_config = {k: v for k, v in llm_config.items() if k in attr.fields_dict(cls.__openllm_generation_class__)} sampling_config = {k: v for k, v in llm_config.items() if k in attr.fields_dict(cls.__openllm_sampling_class__)} else: generation_config = {k: v for k, v in attrs.items() if k in attr.fields_dict(cls.__openllm_generation_class__)} @@ -1342,9 +1245,7 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): def compatible_options(self, request: AttrsInstance) -> dict[str, t.Any]: if importlib.util.find_spec('openllm') is None: - raise MissingDependencyError( - "'openllm' is required to use 'compatible_options'. Make sure to install with 'pip install openllm'." - ) + raise MissingDependencyError("'openllm' is required to use 'compatible_options'. 
Make sure to install with 'pip install openllm'.") from openllm.protocol.cohere import CohereChatRequest, CohereGenerateRequest from openllm.protocol.openai import ChatCompletionRequest, CompletionRequest @@ -1391,17 +1292,11 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): return d @property - def template(self) -> str: - return '{system_message}{instruction}' - + def template(self) -> str: return '{system_message}{instruction}' @property - def system_message(self) -> str: - return '' - + def system_message(self) -> str: return '' @property - def chat_template(self) -> str | None: - return - + def chat_template(self) -> str | None: return @property def chat_messages(self) -> list[MessageParam]: from ._schemas import MessageParam @@ -1430,9 +1325,7 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): f = dantic.attrs_to_options(name, field, cls.__openllm_model_name__, typ=ty, suffix_sampling=True)(f) f = cog.optgroup.group('SamplingParams sampling options')(f) - total_keys = set(attr.fields_dict(cls.__openllm_generation_class__)) | set( - attr.fields_dict(cls.__openllm_sampling_class__) - ) + total_keys = set(attr.fields_dict(cls.__openllm_generation_class__)) | set(attr.fields_dict(cls.__openllm_sampling_class__)) if len(cls.__openllm_accepted_keys__.difference(total_keys)) == 0: return t.cast('click.Command', f) @@ -1453,16 +1346,12 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]): # deprecated def to_generation_config(self, return_as_dict: bool = False) -> transformers.GenerationConfig | DictStrAny: - warnings.warn( - "'to_generation_config' is deprecated, please use 'inference_options' instead.", DeprecationWarning, stacklevel=3 - ) + warnings.warn("'to_generation_config' is deprecated, please use 'inference_options' instead.", DeprecationWarning, stacklevel=3) _, config = self.inference_options(None, 'hf') return config.to_dict() if return_as_dict else config def to_sampling_config(self) -> vllm.SamplingParams: - warnings.warn( - "'to_sampling_config' is deprecated, please use 'inference_options' instead.", DeprecationWarning, stacklevel=3 - ) + warnings.warn("'to_sampling_config' is deprecated, please use 'inference_options' instead.", DeprecationWarning, stacklevel=3) return self.inference_options(None, 'vllm')[-1] @@ -1506,8 +1395,5 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: converter.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config) openllm_home = os.path.expanduser( - os.environ.get( - 'OPENLLM_HOME', - os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm'), - ) + os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm')) ) diff --git a/openllm-core/src/openllm_core/_schemas.py b/openllm-core/src/openllm_core/_schemas.py index ef3f1bae..a84bd4de 100644 --- a/openllm-core/src/openllm_core/_schemas.py +++ b/openllm-core/src/openllm_core/_schemas.py @@ -62,12 +62,7 @@ class GenerationInput(_SchemaMixin): return cls.from_llm_config(AutoConfig.for_model(model_name, **attrs)) def model_dump(self) -> dict[str, t.Any]: - return { - 'prompt': self.prompt, - 'stop': self.stop, - 'llm_config': self.llm_config.model_dump(flatten=True), - 'adapter_name': self.adapter_name, - } + return {'prompt': self.prompt, 'stop': self.stop, 'llm_config': self.llm_config.model_dump(flatten=True), 'adapter_name': self.adapter_name} @classmethod def 
from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]: @@ -176,9 +171,7 @@ class GenerationOutput(_SchemaMixin): @classmethod def from_dict(cls, structured: dict[str, t.Any]) -> GenerationOutput: if structured['prompt_logprobs']: - structured['prompt_logprobs'] = [ - {int(k): v for k, v in it.items()} if it else None for it in structured['prompt_logprobs'] - ] + structured['prompt_logprobs'] = [{int(k): v for k, v in it.items()} if it else None for it in structured['prompt_logprobs']] return cls( prompt=structured['prompt'], finished=structured['finished'], @@ -223,6 +216,4 @@ class GenerationOutput(_SchemaMixin): return orjson.dumps(self.model_dump(), option=orjson.OPT_NON_STR_KEYS).decode('utf-8') -converter.register_structure_hook_func( - lambda cls: attr.has(cls) and issubclass(cls, GenerationOutput), lambda data, cls: cls.from_dict(data) -) +converter.register_structure_hook_func(lambda cls: attr.has(cls) and issubclass(cls, GenerationOutput), lambda data, cls: cls.from_dict(data)) diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index d43c8c45..0b4004d9 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -20,9 +20,7 @@ LiteralDtype = Literal['float16', 'float32', 'bfloat16', 'int8', 'int16'] LiteralSerialisation = Literal['safetensors', 'legacy'] LiteralQuantise = Literal['int8', 'int4', 'gptq', 'awq', 'squeezellm'] LiteralBackend = Literal['pt', 'vllm', 'ctranslate', 'triton'] # TODO: ggml -AdapterType = Literal[ - 'lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3', 'loha', 'lokr' -] +AdapterType = Literal['lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3', 'loha', 'lokr'] LiteralVersionStrategy = Literal['release', 'nightly', 'latest', 'custom'] @@ -56,12 +54,7 @@ else: if sys.version_info[:2] >= (3, 10): from typing import Concatenate as Concatenate, ParamSpec as ParamSpec, TypeAlias as TypeAlias, TypeGuard as TypeGuard else: - from typing_extensions import ( - Concatenate as Concatenate, - ParamSpec as ParamSpec, - TypeAlias as TypeAlias, - TypeGuard as TypeGuard, - ) + from typing_extensions import Concatenate as Concatenate, ParamSpec as ParamSpec, TypeAlias as TypeAlias, TypeGuard as TypeGuard if sys.version_info[:2] >= (3, 9): from typing import Annotated as Annotated diff --git a/openllm-core/src/openllm_core/config/__init__.py b/openllm-core/src/openllm_core/config/__init__.py index 0edc6ade..29e43b1c 100644 --- a/openllm-core/src/openllm_core/config/__init__.py +++ b/openllm-core/src/openllm_core/config/__init__.py @@ -1,9 +1,5 @@ # AUTOGENERATED BY update-config-stubs.py. 
DO NOT EDIT -from .configuration_auto import ( - CONFIG_MAPPING as CONFIG_MAPPING, - CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, - AutoConfig as AutoConfig, -) +from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig from .configuration_baichuan import BaichuanConfig as BaichuanConfig from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig from .configuration_dolly_v2 import DollyV2Config as DollyV2Config diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index a8e7c1b5..49b3a473 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -97,12 +97,7 @@ class _LazyConfigMapping(OrderedDictType, ReprMixin): CONFIG_MAPPING: dict[LiteralString, type[openllm_core.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES) # The below handle special alias when we call underscore to the name directly without processing camelcase first. -CONFIG_NAME_ALIASES: dict[str, str] = { - 'chat_glm': 'chatglm', - 'stable_lm': 'stablelm', - 'star_coder': 'starcoder', - 'gpt_neo_x': 'gpt_neox', -} +CONFIG_NAME_ALIASES: dict[str, str] = {'chat_glm': 'chatglm', 'stable_lm': 'stablelm', 'star_coder': 'starcoder', 'gpt_neo_x': 'gpt_neox'} CONFIG_FILE_NAME = 'config.json' @@ -159,16 +154,14 @@ class AutoConfig: @t.overload @classmethod def for_model(cls, model_name: t.Literal['yi'], **attrs: t.Any) -> openllm_core.config.YiConfig: ... + # update-config-stubs.py: auto stubs stop @classmethod - # update-config-stubs.py: auto stubs stop def for_model(cls, model_name: str, **attrs: t.Any) -> openllm_core.LLMConfig: model_name = inflection.underscore(model_name) if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name].model_construct_env(**attrs) - raise ValueError( - f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}." - ) + raise ValueError(f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.") @classmethod def infer_class_from_name(cls, name: str) -> type[openllm_core.LLMConfig]: @@ -177,9 +170,7 @@ class AutoConfig: model_name = CONFIG_NAME_ALIASES[model_name] if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name] - raise ValueError( - f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}." - ) + raise ValueError(f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.") _cached_mapping = None @@ -200,9 +191,7 @@ class AutoConfig: config_file = llm.bentomodel.path_of(CONFIG_FILE_NAME) except OpenLLMException as err: if not is_transformers_available(): - raise MissingDependencyError( - "Requires 'transformers' to be available. Do 'pip install transformers'" - ) from err + raise MissingDependencyError("Requires 'transformers' to be available. Do 'pip install transformers'") from err from transformers.utils import cached_file try: @@ -219,6 +208,4 @@ class AutoConfig: for architecture in loaded_config['architectures']: if architecture in cls._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE(): return cls.infer_class_from_name(cls._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]) - raise ValueError( - f"Failed to determine config class for '{llm.model_id}'. Make sure {llm.model_id} is saved with openllm." 
- ) + raise ValueError(f"Failed to determine config class for '{llm.model_id}'. Make sure {llm.model_id} is saved with openllm.") diff --git a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py index 1114c22a..0fc48b1e 100644 --- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py +++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py @@ -9,9 +9,7 @@ if t.TYPE_CHECKING: INSTRUCTION_KEY = '### Instruction:' RESPONSE_KEY = '### Response:' END_KEY = '### End' -INTRO_BLURB = ( - 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' -) +INTRO_BLURB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) -> int: diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py index 4de691f6..8c4c42c4 100644 --- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py +++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py @@ -18,13 +18,7 @@ class FlanT5Config(openllm_core.LLMConfig): 'backend': ('pt',), # NOTE: See https://www.philschmid.de/fine-tune-flan-t5. No specific template found, but seems to have the same dialogue style 'default_id': 'google/flan-t5-large', - 'model_ids': [ - 'google/flan-t5-small', - 'google/flan-t5-base', - 'google/flan-t5-large', - 'google/flan-t5-xl', - 'google/flan-t5-xxl', - ], + 'model_ids': ['google/flan-t5-small', 'google/flan-t5-base', 'google/flan-t5-large', 'google/flan-t5-xl', 'google/flan-t5-xxl'], } class GenerationConfig: diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py index ff88db9f..8f034453 100644 --- a/openllm-core/src/openllm_core/config/configuration_llama.py +++ b/openllm-core/src/openllm_core/config/configuration_llama.py @@ -38,9 +38,7 @@ class LlamaConfig(openllm_core.LLMConfig): 'NousResearch/llama-2-13b-hf', 'NousResearch/llama-2-7b-hf', ], - 'fine_tune_strategies': ( - {'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none'}, - ), + 'fine_tune_strategies': ({'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none'},), } class GenerationConfig: @@ -56,11 +54,7 @@ class LlamaConfig(openllm_core.LLMConfig): @property def template(self) -> str: return '{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key}\n'.format( - start_key=SINST_KEY, - sys_key=SYS_KEY, - system_message='{system_message}', - instruction='{instruction}', - end_key=EINST_KEY, + start_key=SINST_KEY, sys_key=SYS_KEY, system_message='{system_message}', instruction='{instruction}', end_key=EINST_KEY ) @property diff --git a/openllm-core/src/openllm_core/config/configuration_mistral.py b/openllm-core/src/openllm_core/config/configuration_mistral.py index f608ea8b..21cc1723 100644 --- a/openllm-core/src/openllm_core/config/configuration_mistral.py +++ b/openllm-core/src/openllm_core/config/configuration_mistral.py @@ -30,9 +30,7 @@ class MistralConfig(openllm_core.LLMConfig): 'mistralai/Mistral-7B-Instruct-v0.1', 'mistralai/Mistral-7B-v0.1', ], - 'fine_tune_strategies': ( - {'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none'}, - ), + 'fine_tune_strategies': ({'adapter_type': 'lora', 'r': 64, 
'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none'},), } class GenerationConfig: @@ -49,11 +47,7 @@ class MistralConfig(openllm_core.LLMConfig): @property def template(self) -> str: return """{start_key}{start_inst} {system_message} {instruction} {end_inst}\n""".format( - start_inst=SINST_KEY, - end_inst=EINST_KEY, - start_key=BOS_TOKEN, - system_message='{system_message}', - instruction='{instruction}', + start_inst=SINST_KEY, end_inst=EINST_KEY, start_key=BOS_TOKEN, system_message='{system_message}', instruction='{instruction}' ) # NOTE: https://docs.mistral.ai/usage/guardrailing/ diff --git a/openllm-core/src/openllm_core/config/configuration_mixtral.py b/openllm-core/src/openllm_core/config/configuration_mixtral.py index e216070a..1c019bbb 100644 --- a/openllm-core/src/openllm_core/config/configuration_mixtral.py +++ b/openllm-core/src/openllm_core/config/configuration_mixtral.py @@ -35,11 +35,7 @@ class MixtralConfig(openllm_core.LLMConfig): @property def template(self) -> str: return """{start_key}{start_inst} {system_message} {instruction} {end_inst}\n""".format( - start_inst=SINST_KEY, - end_inst=EINST_KEY, - start_key=BOS_TOKEN, - system_message='{system_message}', - instruction='{instruction}', + start_inst=SINST_KEY, end_inst=EINST_KEY, start_key=BOS_TOKEN, system_message='{system_message}', instruction='{instruction}' ) # NOTE: https://docs.mistral.ai/usage/guardrailing/ diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py index 1676896d..e403f726 100644 --- a/openllm-core/src/openllm_core/config/configuration_opt.py +++ b/openllm-core/src/openllm_core/config/configuration_opt.py @@ -20,23 +20,9 @@ class OPTConfig(openllm_core.LLMConfig): 'url': 'https://huggingface.co/docs/transformers/model_doc/opt', 'default_id': 'facebook/opt-1.3b', 'architecture': 'OPTForCausalLM', - 'model_ids': [ - 'facebook/opt-125m', - 'facebook/opt-350m', - 'facebook/opt-1.3b', - 'facebook/opt-2.7b', - 'facebook/opt-6.7b', - 'facebook/opt-66b', - ], + 'model_ids': ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b', 'facebook/opt-6.7b', 'facebook/opt-66b'], 'fine_tune_strategies': ( - { - 'adapter_type': 'lora', - 'r': 16, - 'lora_alpha': 32, - 'target_modules': ['q_proj', 'v_proj'], - 'lora_dropout': 0.05, - 'bias': 'none', - }, + {'adapter_type': 'lora', 'r': 16, 'lora_alpha': 32, 'target_modules': ['q_proj', 'v_proj'], 'lora_dropout': 0.05, 'bias': 'none'}, ), } diff --git a/openllm-core/src/openllm_core/config/configuration_phi.py b/openllm-core/src/openllm_core/config/configuration_phi.py index 2c56b8c4..40b537e9 100644 --- a/openllm-core/src/openllm_core/config/configuration_phi.py +++ b/openllm-core/src/openllm_core/config/configuration_phi.py @@ -27,9 +27,7 @@ class PhiConfig(openllm_core.LLMConfig): 'default_id': 'microsoft/phi-1_5', 'serialisation': 'safetensors', 'model_ids': ['microsoft/phi-1_5'], - 'fine_tune_strategies': ( - {'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none'}, - ), + 'fine_tune_strategies': ({'adapter_type': 'lora', 'r': 64, 'lora_alpha': 16, 'lora_dropout': 0.1, 'bias': 'none'},), } class GenerationConfig: @@ -49,13 +47,7 @@ class PhiConfig(openllm_core.LLMConfig): from openllm_core._schemas import MessageParam return [ - MessageParam( - role='user', content="I don't know why, I'm struggling to maintain focus while studying. Any suggestions?" - ), - MessageParam( - role='assistant', content='Have you tried using a timer? 
It can help you stay on track and avoid distractions.' - ), - MessageParam( - role='user', content="That's a good idea. I'll give it a try. What else can I do to boost my productivity?" - ), + MessageParam(role='user', content="I don't know why, I'm struggling to maintain focus while studying. Any suggestions?"), + MessageParam(role='assistant', content='Have you tried using a timer? It can help you stay on track and avoid distractions.'), + MessageParam(role='user', content="That's a good idea. I'll give it a try. What else can I do to boost my productivity?"), ] diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index 6ce63e7c..085d6423 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -24,9 +24,7 @@ def _WithArgsTypes() -> tuple[type[t.Any], ...]: except ImportError: _TypingGenericAlias = () # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on) # _GenericAlias is the actual GenericAlias implementation - return ( - (_TypingGenericAlias,) if sys.version_info < (3, 10) else (t._GenericAlias, types.GenericAlias, types.UnionType) - ) + return (_TypingGenericAlias,) if sys.version_info < (3, 10) else (t._GenericAlias, types.GenericAlias, types.UnionType) def lenient_issubclass(cls, class_or_tuple): @@ -209,9 +207,7 @@ def flatten_attrs(**attrs): # Special debug flag controled via DEBUG DEBUG = sys.flags.dev_mode or (not sys.flags.ignore_environment and check_bool_env(DEV_DEBUG_VAR, default=False)) # Whether to show the codenge for debug purposes -SHOW_CODEGEN = ( - DEBUG and os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit() and int(os.environ.get(DEV_DEBUG_VAR, str(0))) > 3 -) +SHOW_CODEGEN = DEBUG and os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit() and int(os.environ.get(DEV_DEBUG_VAR, str(0))) > 3 # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins MYPY = False @@ -261,11 +257,7 @@ _LOGGING_CONFIG = { 'warningfilter': {'()': 'openllm_core.utils.WarningFilter'}, }, 'handlers': { - 'bentomlhandler': { - 'class': 'logging.StreamHandler', - 'filters': ['excfilter', 'warningfilter', 'infofilter'], - 'stream': 'ext://sys.stdout', - }, + 'bentomlhandler': {'class': 'logging.StreamHandler', 'filters': ['excfilter', 'warningfilter', 'infofilter'], 'stream': 'ext://sys.stdout'}, 'defaulthandler': {'class': 'logging.StreamHandler', 'level': logging.WARNING}, }, 'loggers': { @@ -301,9 +293,7 @@ def configure_logging(): # since _extras will be the locals() import from this file. 
_extras = { **{ - k: v - for k, v in locals().items() - if k in {'pkg'} or (not isinstance(v, types.ModuleType) and k not in {'annotations'} and not k.startswith('_')) + k: v for k, v in locals().items() if k in {'pkg'} or (not isinstance(v, types.ModuleType) and k not in {'annotations'} and not k.startswith('_')) }, '__openllm_migration__': {'bentoml_cattr': 'converter'}, } diff --git a/openllm-core/src/openllm_core/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py index 0ee37ecb..3c1d43c1 100644 --- a/openllm-core/src/openllm_core/utils/codegen.py +++ b/openllm-core/src/openllm_core/utils/codegen.py @@ -121,11 +121,7 @@ def generate_function( ) -> AnyCallable: from openllm_core.utils import SHOW_CODEGEN - script = 'def %s(%s):\n %s\n' % ( - func_name, - ', '.join(args) if args is not None else '', - '\n '.join(lines) if lines else 'pass', - ) + script = 'def %s(%s):\n %s\n' % (func_name, ', '.join(args) if args is not None else '', '\n '.join(lines) if lines else 'pass') meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs) if annotations: meth.__annotations__ = annotations @@ -197,4 +193,4 @@ def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: ) -__all__ = ['gen_sdk', 'generate_function', 'generate_unique_filename', 'make_attr_tuple_class', 'make_env_transformer'] +__all__ = ['gen_sdk', 'make_attr_tuple_class', 'make_env_transformer', 'generate_unique_filename', 'generate_function'] diff --git a/openllm-core/src/openllm_core/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py index b723b816..a25c6b40 100644 --- a/openllm-core/src/openllm_core/utils/dantic.py +++ b/openllm-core/src/openllm_core/utils/dantic.py @@ -20,23 +20,23 @@ AnyCallable = t.Callable[..., t.Any] FC = t.TypeVar('FC', bound=t.Union[AnyCallable, click.Command]) __all__ = [ - 'CUDA', 'FC', - 'BytesType', - 'EnumChoice', - 'Field', - 'JsonType', - 'LiteralChoice', - 'ModuleType', - 'allows_multiple', 'attrs_to_options', - 'is_container', - 'is_literal', - 'is_mapping', + 'Field', + 'parse_type', 'is_typing', + 'is_literal', + 'ModuleType', + 'EnumChoice', + 'LiteralChoice', + 'allows_multiple', + 'is_mapping', + 'is_container', 'parse_container_args', 'parse_single_arg', - 'parse_type', + 'CUDA', + 'JsonType', + 'BytesType', ] @@ -45,12 +45,7 @@ def __dir__() -> list[str]: def attrs_to_options( - name: str, - field: attr.Attribute[t.Any], - model_name: str, - typ: t.Any = None, - suffix_generation: bool = False, - suffix_sampling: bool = False, + name: str, field: attr.Attribute[t.Any], model_name: str, typ: t.Any = None, suffix_generation: bool = False, suffix_sampling: bool = False ) -> t.Callable[[FC], FC]: # TODO: support parsing nested attrs class and Union envvar = field.metadata['env'] diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py index cc5661a7..7d36e6c4 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -1,17 +1,6 @@ import importlib, importlib.metadata, importlib.util, os -OPTIONAL_DEPENDENCIES = { - 'vllm', - 'fine-tune', - 'ggml', - 'ctranslate', - 'agents', - 'openai', - 'playground', - 'gptq', - 'grpc', - 'awq', -} +OPTIONAL_DEPENDENCIES = {'vllm', 'fine-tune', 'ggml', 'ctranslate', 'agents', 'openai', 'playground', 'gptq', 'grpc', 'awq'} ENV_VARS_TRUE_VALUES = {'1', 'ON', 'YES', 'TRUE'} ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({'AUTO'}) USE_VLLM = 
os.getenv('USE_VLLM', 'AUTO').upper() diff --git a/openllm-core/src/openllm_core/utils/lazy.py b/openllm-core/src/openllm_core/utils/lazy.py index d4dc0134..0bcf6d36 100644 --- a/openllm-core/src/openllm_core/utils/lazy.py +++ b/openllm-core/src/openllm_core/utils/lazy.py @@ -17,7 +17,7 @@ import attr import openllm_core -__all__ = ['LazyLoader', 'LazyModule', 'VersionInfo'] +__all__ = ['VersionInfo', 'LazyModule', 'LazyLoader'] logger = logging.getLogger(__name__) @@ -107,9 +107,7 @@ class VersionInfo: raise NotImplementedError if not (1 <= len(cmp) <= 4): raise NotImplementedError - return t.cast(t.Tuple[int, int, int, str], attr.astuple(self)[: len(cmp)]), t.cast( - t.Tuple[int, int, int, str], cmp - ) + return t.cast(t.Tuple[int, int, int, str], attr.astuple(self)[: len(cmp)]), t.cast(t.Tuple[int, int, int, str], cmp) def __eq__(self, other: t.Any) -> bool: try: @@ -231,9 +229,7 @@ class LazyModule(types.ModuleType): cur_value = self._objects['__openllm_migration__'].get(name, _sentinel) if cur_value is not _sentinel: warnings.warn( - f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead", - DeprecationWarning, - stacklevel=3, + f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead", DeprecationWarning, stacklevel=3 ) return getattr(self, cur_value) if name in self._objects: @@ -254,9 +250,7 @@ class LazyModule(types.ModuleType): try: return importlib.import_module('.' + module_name, self.__name__) except Exception as e: - raise RuntimeError( - f'Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}' - ) from e + raise RuntimeError(f'Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}') from e # make sure this module is picklable def __reduce__(self) -> tuple[type[LazyModule], tuple[str, str | None, dict[str, list[str]]]]: diff --git a/openllm-core/src/openllm_core/utils/peft.py b/openllm-core/src/openllm_core/utils/peft.py index b0b25fd5..f8d85a65 100644 --- a/openllm-core/src/openllm_core/utils/peft.py +++ b/openllm-core/src/openllm_core/utils/peft.py @@ -87,12 +87,8 @@ class FineTuneConfig: converter=attr.converters.default_if_none(factory=dict), use_default_converter=False, ) - inference_mode: bool = dantic.Field( - False, description='Whether to use this Adapter for inference', use_default_converter=False - ) - llm_config_class: type[LLMConfig] = dantic.Field( - None, description='The reference class to openllm.LLMConfig', use_default_converter=False - ) + inference_mode: bool = dantic.Field(False, description='Whether to use this Adapter for inference', use_default_converter=False) + llm_config_class: type[LLMConfig] = dantic.Field(None, description='The reference class to openllm.LLMConfig', use_default_converter=False) def build(self) -> PeftConfig: try: @@ -110,12 +106,7 @@ class FineTuneConfig: # respect user set task_type if it is passed, otherwise use one managed by OpenLLM inference_mode = adapter_config.pop('inference_mode', self.inference_mode) task_type = adapter_config.pop('task_type', TaskType[self.llm_config_class.peft_task_type()]) - adapter_config = { - 'peft_type': self.adapter_type.value, - 'task_type': task_type, - 'inference_mode': inference_mode, - **adapter_config, - } + adapter_config = {'peft_type': self.adapter_type.value, 'task_type': task_type, 'inference_mode': inference_mode, **adapter_config} return get_peft_config(adapter_config) 
def train(self) -> FineTuneConfig: @@ -127,18 +118,10 @@ class FineTuneConfig: return self def with_config(self, **attrs: t.Any) -> FineTuneConfig: - adapter_type, inference_mode = ( - attrs.pop('adapter_type', self.adapter_type), - attrs.get('inference_mode', self.inference_mode), - ) + adapter_type, inference_mode = (attrs.pop('adapter_type', self.adapter_type), attrs.get('inference_mode', self.inference_mode)) if 'llm_config_class' in attrs: raise ForbiddenAttributeError("'llm_config_class' should not be passed when using 'with_config'.") - return attr.evolve( - self, - adapter_type=adapter_type, - inference_mode=inference_mode, - adapter_config=config_merger.merge(self.adapter_config, attrs), - ) + return attr.evolve(self, adapter_type=adapter_type, inference_mode=inference_mode, adapter_config=config_merger.merge(self.adapter_config, attrs)) @classmethod def from_config(cls, ft_config: dict[str, t.Any], llm_config_cls: type[LLMConfig]) -> FineTuneConfig: @@ -146,9 +129,4 @@ class FineTuneConfig: adapter_type = copied.pop('adapter_type', 'lora') inference_mode = copied.pop('inference_mode', False) llm_config_class = copied.pop('llm_confg_class', llm_config_cls) - return cls( - adapter_type=adapter_type, - adapter_config=copied, - inference_mode=inference_mode, - llm_config_class=llm_config_class, - ) + return cls(adapter_type=adapter_type, adapter_config=copied, inference_mode=inference_mode, llm_config_class=llm_config_class) diff --git a/openllm-core/src/openllm_core/utils/pkg.py b/openllm-core/src/openllm_core/utils/pkg.py index 3b271c2f..1040e209 100644 --- a/openllm-core/src/openllm_core/utils/pkg.py +++ b/openllm-core/src/openllm_core/utils/pkg.py @@ -8,7 +8,7 @@ from typing import cast from packaging.version import Version -__all__ = ['PackageNotFoundError', 'find_spec', 'get_pkg_version', 'pkg_version_info', 'source_locations'] +__all__ = ['PackageNotFoundError', 'pkg_version_info', 'get_pkg_version', 'source_locations', 'find_spec'] get_pkg_version = importlib.metadata.version find_spec = importlib.util.find_spec diff --git a/openllm-core/src/openllm_core/utils/serde.py b/openllm-core/src/openllm_core/utils/serde.py index 10d35255..c03a1653 100644 --- a/openllm-core/src/openllm_core/utils/serde.py +++ b/openllm-core/src/openllm_core/utils/serde.py @@ -19,16 +19,10 @@ def datetime_structure_hook(dt_like: str | datetime | t.Any, _: t.Any) -> dateti converter.register_structure_hook_factory( - attr.has, - lambda cls: make_dict_structure_fn( - cls, converter, _cattrs_forbid_extra_keys=getattr(cls, '__forbid_extra_keys__', False) - ), + attr.has, lambda cls: make_dict_structure_fn(cls, converter, _cattrs_forbid_extra_keys=getattr(cls, '__forbid_extra_keys__', False)) ) converter.register_unstructure_hook_factory( - attr.has, - lambda cls: make_dict_unstructure_fn( - cls, converter, _cattrs_omit_if_default=getattr(cls, '__omit_if_default__', False) - ), + attr.has, lambda cls: make_dict_unstructure_fn(cls, converter, _cattrs_omit_if_default=getattr(cls, '__omit_if_default__', False)) ) converter.register_structure_hook(datetime, datetime_structure_hook) converter.register_unstructure_hook(datetime, lambda dt: dt.isoformat()) diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 12bf827b..f53db436 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -9,16 +9,10 @@ else: # configuration for bitsandbytes before import _os.environ['BITSANDBYTES_NOWELCOME'] = 
_os.environ.get('BITSANDBYTES_NOWELCOME', '1') # NOTE: The following warnings from bitsandbytes, and probably not that important for users to see when DEBUG is False - _warnings.filterwarnings( - 'ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization' - ) - _warnings.filterwarnings( - 'ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization' - ) + _warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization') + _warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization') _warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.') - _warnings.filterwarnings( - 'ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated' - ) + _warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated') COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so') __lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__ __name__, diff --git a/openllm-python/src/openllm/__init__.pyi b/openllm-python/src/openllm/__init__.pyi index 3d2e8b6b..4b65bc5a 100644 --- a/openllm-python/src/openllm/__init__.pyi +++ b/openllm-python/src/openllm/__init__.pyi @@ -11,48 +11,14 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease. """ # update-config-stubs.py: import stubs start -from openlm_core.config import ( - CONFIG_MAPPING as CONFIG_MAPPING, - CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, - AutoConfig as AutoConfig, - BaichuanConfig as BaichuanConfig, - ChatGLMConfig as ChatGLMConfig, - DollyV2Config as DollyV2Config, - FalconConfig as FalconConfig, - FlanT5Config as FlanT5Config, - GPTNeoXConfig as GPTNeoXConfig, - LlamaConfig as LlamaConfig, - MistralConfig as MistralConfig, - MixtralConfig as MixtralConfig, - MPTConfig as MPTConfig, - OPTConfig as OPTConfig, - PhiConfig as PhiConfig, - QwenConfig as QwenConfig, - StableLMConfig as StableLMConfig, - StarCoderConfig as StarCoderConfig, - YiConfig as YiConfig, -) +from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig # update-config-stubs.py: import stubs stop from openllm_cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start -from openllm_core._configuration import ( - GenerationConfig as GenerationConfig, - LLMConfig as LLMConfig, - SamplingParams as SamplingParams, -) -from openllm_core._schemas import ( - GenerationInput as GenerationInput, - GenerationOutput as GenerationOutput, - MetadataOutput as MetadataOutput, -) +from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams +from openllm_core._schemas import GenerationInput as GenerationInput, 
GenerationOutput as GenerationOutput, MetadataOutput as MetadataOutput -from . import ( - bundle as bundle, - client as client, - exceptions as exceptions, - serialisation as serialisation, - utils as utils, -) +from . import bundle as bundle, client as client, exceptions as exceptions, serialisation as serialisation, utils as utils from ._deprecated import Runner as Runner from ._llm import LLM as LLM from ._quantisation import infer_quantisation_config as infer_quantisation_config diff --git a/openllm-python/src/openllm/_deprecated.py b/openllm-python/src/openllm/_deprecated.py index c098b3b0..559ad0b2 100644 --- a/openllm-python/src/openllm/_deprecated.py +++ b/openllm-python/src/openllm/_deprecated.py @@ -19,9 +19,7 @@ def Runner( if llm_config is None: llm_config = openllm.AutoConfig.for_model(model_name) if not ensure_available: - logger.warning( - "'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation." - ) + logger.warning("'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation.") model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id'])) warnings.warn( f"""\ @@ -42,13 +40,8 @@ def Runner( attrs.update({ 'model_id': model_id, 'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), # - 'serialisation': getenv( - 'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION'] - ), + 'serialisation': getenv('serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']), }) return openllm.LLM( - backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), - llm_config=llm_config, - embedded=init_local, - **attrs, + backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), llm_config=llm_config, embedded=init_local, **attrs ).runner diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index cdb5bd02..34291695 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -47,9 +47,7 @@ ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]] @attr.define(slots=False, repr=False, init=False) class LLM(t.Generic[M, T]): - async def generate( - self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs - ): + async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs): if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') config = self.config.model_construct_env(**attrs) @@ -64,15 +62,10 @@ class LLM(t.Generic[M, T]): raise RuntimeError('No result is returned.') return final_result.with_options( prompt=prompt, - outputs=[ - output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) - for output in final_result.outputs - ], + outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs], ) - async def generate_iterator( - self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs - ): + async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs): from 
bentoml._internal.runner.runner_handle import DummyRunnerHandle if adapter_name is not None and self.__llm_backend__ != 'pt': @@ -137,9 +130,7 @@ class LLM(t.Generic[M, T]): # The below are mainly for internal implementation that you don't have to worry about. _model_id: str _revision: t.Optional[str] # - _quantization_config: t.Optional[ - t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig] - ] + _quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] _quantise: t.Optional[LiteralQuantise] _model_decls: t.Tuple[t.Any, ...] __model_attrs: t.Dict[str, t.Any] # @@ -155,9 +146,7 @@ class LLM(t.Generic[M, T]): __llm_torch_dtype__: 'torch.dtype' = None __llm_config__: t.Optional[LLMConfig] = None __llm_backend__: LiteralBackend = None - __llm_quantization_config__: t.Optional[ - t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig] - ] = None + __llm_quantization_config__: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] = None __llm_runner__: t.Optional[Runner[M, T]] = None __llm_model__: t.Optional[M] = None __llm_tokenizer__: t.Optional[T] = None @@ -188,9 +177,7 @@ class LLM(t.Generic[M, T]): torch_dtype = attrs.pop('torch_dtype', None) # backward compatible if torch_dtype is not None: warnings.warn( - 'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', - DeprecationWarning, - stacklevel=3, + 'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3 ) dtype = torch_dtype _local = False @@ -246,19 +233,13 @@ class LLM(t.Generic[M, T]): class _Quantise: @staticmethod - def pt(llm: LLM, quantise=None): - return quantise - + def pt(llm: LLM, quantise=None): return quantise @staticmethod - def vllm(llm: LLM, quantise=None): - return quantise - + def vllm(llm: LLM, quantise=None): return quantise @staticmethod def ctranslate(llm: LLM, quantise=None): - if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}: - raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'") - if quantise == 'int8': - quantise = 'int8_float16' if llm._has_gpus else 'int8_float32' + if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}: raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'") + if quantise == 'int8': quantise = 'int8_float16' if llm._has_gpus else 'int8_float32' return quantise @apply(lambda val: tuple(str.lower(i) if i else i for i in val)) @@ -266,15 +247,10 @@ class LLM(t.Generic[M, T]): model_id, *maybe_revision = model_id.rsplit(':') if len(maybe_revision) > 0: if model_version is not None: - logger.warning( - "revision is specified (%s). 'model_version=%s' will be ignored.", maybe_revision[0], model_version - ) + logger.warning("revision is specified (%s). 
'model_version=%s' will be ignored.", maybe_revision[0], model_version) model_version = maybe_revision[0] if validate_is_path(model_id): - model_id, model_version = ( - resolve_filepath(model_id), - first_not_none(model_version, default=generate_hash_from_file(model_id)), - ) + model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id)) return f'{backend}-{normalise_model_name(model_id)}', model_version @functools.cached_property @@ -283,11 +259,9 @@ class LLM(t.Generic[M, T]): from cuda import cuda err, *_ = cuda.cuInit(0) - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('Failed to initialise CUDA runtime binding.') + if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.') err, _ = cuda.cuDeviceGetCount() - if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('Failed to get CUDA device count.') + if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to get CUDA device count.') return True except (ImportError, RuntimeError): return False @@ -299,9 +273,7 @@ class LLM(t.Generic[M, T]): _map = _torch_dtype_mapping() if not isinstance(self.__llm_torch_dtype__, torch.dtype): try: - hf_config = transformers.AutoConfig.from_pretrained( - self.bentomodel.path, trust_remote_code=self.trust_remote_code - ) + hf_config = transformers.AutoConfig.from_pretrained(self.bentomodel.path, trust_remote_code=self.trust_remote_code) except OpenLLMException: hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code) config_dtype = getattr(hf_config, 'torch_dtype', None) @@ -332,9 +304,7 @@ class LLM(t.Generic[M, T]): return {**self.import_kwargs[1], **self.__tokenizer_attrs} def _cascade_backend(self) -> LiteralBackend: - logger.warning( - 'It is recommended to specify the backend explicitly. Cascading backend might lead to unexpected behaviour.' - ) + logger.warning('It is recommended to specify the backend explicitly. 
Cascading backend might lead to unexpected behaviour.') if self._has_gpus: if is_vllm_available(): return 'vllm' @@ -369,10 +339,7 @@ class LLM(t.Generic[M, T]): @property def import_kwargs(self): - return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, { - 'padding_side': 'left', - 'truncation_side': 'left', - } + return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {'padding_side': 'left', 'truncation_side': 'left'} @property def trust_remote_code(self): @@ -405,9 +372,7 @@ class LLM(t.Generic[M, T]): if self._quantization_config is not None: self.__llm_quantization_config__ = self._quantization_config elif self._quantise is not None: - self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config( - self, self._quantise, **self._model_attrs - ) + self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self, self._quantise, **self._model_attrs) else: raise ValueError("Either 'quantization_config' or 'quantise' must be specified.") return self.__llm_quantization_config__ @@ -462,11 +427,7 @@ class LLM(t.Generic[M, T]): model = get_peft_model( prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checking), - self.config['fine_tune_strategies'] - .get(adapter_type, self.config.make_fine_tune_config(adapter_type)) - .train() - .with_config(**attrs) - .build(), + self.config['fine_tune_strategies'].get(adapter_type, self.config.make_fine_tune_config(adapter_type)).train().with_config(**attrs).build(), ) if DEBUG: model.print_trainable_parameters() @@ -486,10 +447,7 @@ class LLM(t.Generic[M, T]): if self.__llm_adapter_map__ is None: _map: ResolvedAdapterMap = {k: {} for k in self._adapter_map} for adapter_type, adapter_tuple in self._adapter_map.items(): - base = first_not_none( - self.config['fine_tune_strategies'].get(adapter_type), - default=self.config.make_fine_tune_config(adapter_type), - ) + base = first_not_none(self.config['fine_tune_strategies'].get(adapter_type), default=self.config.make_fine_tune_config(adapter_type)) for adapter in adapter_tuple: _map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id) self.__llm_adapter_map__ = _map @@ -504,9 +462,7 @@ class LLM(t.Generic[M, T]): import torch loaded_in_kbit = ( - getattr(model, 'is_loaded_in_8bit', False) - or getattr(model, 'is_loaded_in_4bit', False) - or getattr(model, 'is_quantized', False) + getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False) ) if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit: try: @@ -528,9 +484,7 @@ class LLM(t.Generic[M, T]): if self.__llm_config__ is None: if self.__llm_backend__ == 'ctranslate': try: - config = transformers.AutoConfig.from_pretrained( - self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code - ) + config = transformers.AutoConfig.from_pretrained(self.bentomodel.path_of('/hf'), trust_remote_code=self.trust_remote_code) except OpenLLMException: config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code) for architecture in config.architectures: @@ -563,18 +517,12 @@ def _torch_dtype_mapping() -> dict[str, torch.dtype]: def normalise_model_name(name: str) -> str: - return ( - os.path.basename(resolve_filepath(name)) - if validate_is_path(name) - else inflection.dasherize(name.replace('/', '--')) - ) + return 
os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/', '--')) def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: if not is_peft_available(): - raise RuntimeError( - "LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'" - ) + raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'") from huggingface_hub import hf_hub_download resolved: AdapterMap = {} diff --git a/openllm-python/src/openllm/_llm.pyi b/openllm-python/src/openllm/_llm.pyi index d4c62bd6..910a12df 100644 --- a/openllm-python/src/openllm/_llm.pyi +++ b/openllm-python/src/openllm/_llm.pyi @@ -8,16 +8,7 @@ from peft.peft_model import PeftModel, PeftModelForCausalLM, PeftModelForSeq2Seq from bentoml import Model, Tag from openllm_core import LLMConfig from openllm_core._schemas import GenerationOutput -from openllm_core._typing_compat import ( - AdapterMap, - AdapterType, - LiteralBackend, - LiteralDtype, - LiteralQuantise, - LiteralSerialisation, - M, - T, -) +from openllm_core._typing_compat import AdapterMap, AdapterType, LiteralBackend, LiteralDtype, LiteralQuantise, LiteralSerialisation, M, T from ._quantisation import QuantizationConfig from ._runners import Runner @@ -121,9 +112,7 @@ class LLM(Generic[M, T]): def runner(self) -> Runner[M, T]: ... @property def adapter_map(self) -> ResolvedAdapterMap: ... - def prepare( - self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any - ) -> Tuple[InjectedModel, T]: ... + def prepare(self, adapter_type: AdapterType = ..., use_gradient_checking: bool = ..., **attrs: Any) -> Tuple[InjectedModel, T]: ... async def generate( self, prompt: Optional[str], diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index e6bee318..1765a564 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -83,25 +83,19 @@ def infer_quantisation_config(llm, quantise, **attrs): # NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training. if not is_bitsandbytes_available(): - raise RuntimeError( - 'Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'' - ) + raise RuntimeError('Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'') if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules) elif quantise == 'int4': quantisation_config = create_int4_config() elif quantise == 'gptq': if not is_autogptq_available(): - raise MissingDependencyError( - "GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'" - ) + raise MissingDependencyError("GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'") else: quantisation_config = create_gptq_config() elif quantise == 'awq': if not is_autoawq_available(): - raise MissingDependencyError( - "AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'." - ) + raise MissingDependencyError("AWQ requires 'auto-awq' to be installed. 
Do it with 'pip install \"openllm[awq]\"'.") else: quantisation_config = create_awq_config() else: diff --git a/openllm-python/src/openllm/_quantisation.pyi b/openllm-python/src/openllm/_quantisation.pyi index d41809f7..a3e01878 100644 --- a/openllm-python/src/openllm/_quantisation.pyi +++ b/openllm-python/src/openllm/_quantisation.pyi @@ -9,18 +9,10 @@ from ._llm import LLM QuantizationConfig = Union[BitsAndBytesConfig, GPTQConfig, AwqConfig] @overload -def infer_quantisation_config( - self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any -) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ... +def infer_quantisation_config(self: LLM[M, T], quantise: Literal['int8', 'int4'], **attrs: Any) -> tuple[BitsAndBytesConfig, Dict[str, Any]]: ... @overload -def infer_quantisation_config( - self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any -) -> tuple[GPTQConfig, Dict[str, Any]]: ... +def infer_quantisation_config(self: LLM[M, T], quantise: Literal['gptq'], **attrs: Any) -> tuple[GPTQConfig, Dict[str, Any]]: ... @overload -def infer_quantisation_config( - self: LLM[M, T], quantise: Literal['awq'], **attrs: Any -) -> tuple[AwqConfig, Dict[str, Any]]: ... +def infer_quantisation_config(self: LLM[M, T], quantise: Literal['awq'], **attrs: Any) -> tuple[AwqConfig, Dict[str, Any]]: ... @overload -def infer_quantisation_config( - self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any -) -> tuple[QuantizationConfig, Dict[str, Any]]: ... +def infer_quantisation_config(self: LLM[M, T], quantise: LiteralQuantise, **attrs: Any) -> tuple[QuantizationConfig, Dict[str, Any]]: ... diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py index b44ddf93..9b069f06 100644 --- a/openllm-python/src/openllm/_runners.py +++ b/openllm-python/src/openllm/_runners.py @@ -46,10 +46,7 @@ def runner(llm: openllm.LLM[M, T]) -> Runner[M, T]: ( 'runner_methods', { - method.name: { - 'batchable': method.config.batchable, - 'batch_dim': method.config.batch_dim if method.config.batchable else None, - } + method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None} for method in _.runner_methods }, ), @@ -114,7 +111,6 @@ class CTranslateRunnable(bentoml.Runnable): ).model_dump_json() yield bentoml.io.SSE(out).marshal() - @registry class vLLMRunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu') @@ -130,9 +126,7 @@ class vLLMRunnable(bentoml.Runnable): if dev >= 2: num_gpus = min(dev // 2 * 2, dev) quantise = llm.quantise if llm.quantise and llm.quantise in {'gptq', 'awq', 'squeezellm'} else None - dtype = ( - torch.float16 if quantise == 'gptq' else llm._torch_dtype - ) # NOTE: quantise GPTQ doesn't support bfloat16 yet. + dtype = torch.float16 if quantise == 'gptq' else llm._torch_dtype # NOTE: quantise GPTQ doesn't support bfloat16 yet. 
try: self.model = vllm.AsyncLLMEngine.from_engine_args( vllm.AsyncEngineArgs( @@ -151,9 +145,7 @@ class vLLMRunnable(bentoml.Runnable): ) except Exception as err: traceback.print_exc() - raise openllm.exceptions.OpenLLMException( - f'Failed to initialise vLLMEngine due to the following error:\n{err}' - ) from err + raise openllm.exceptions.OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err @bentoml.Runnable.method(batchable=False) async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs): @@ -210,9 +202,7 @@ class PyTorchRunnable(bentoml.Runnable): if config['logprobs']: # FIXME: logprobs is not supported raise NotImplementedError('Logprobs is yet to be supported with encoder-decoder models.') encoder_output = self.model.encoder(input_ids=torch.as_tensor([prompt_token_ids], device=self.device))[0] - start_ids = torch.as_tensor( - [[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device - ) + start_ids = torch.as_tensor([[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device) else: start_ids = torch.as_tensor([prompt_token_ids], device=self.device) @@ -240,9 +230,7 @@ class PyTorchRunnable(bentoml.Runnable): ) logits = self.model.lm_head(out[0]) else: - out = self.model( - input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True - ) + out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True) logits = out.logits past_key_values = out.past_key_values if logits_processor: @@ -286,12 +274,7 @@ class PyTorchRunnable(bentoml.Runnable): tmp_output_ids, rfind_start = output_token_ids[input_len:], 0 # XXX: Move this to API server - text = self.tokenizer.decode( - tmp_output_ids, - skip_special_tokens=True, - spaces_between_special_tokens=False, - clean_up_tokenization_spaces=True, - ) + text = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True) if len(stop) > 0: for it in stop: diff --git a/openllm-python/src/openllm/_runners.pyi b/openllm-python/src/openllm/_runners.pyi index 681d8e57..0b422dcc 100644 --- a/openllm-python/src/openllm/_runners.pyi +++ b/openllm-python/src/openllm/_runners.pyi @@ -1,19 +1,4 @@ -from typing import ( - Any, - AsyncGenerator, - Dict, - Generic, - Iterable, - List, - Literal, - Optional, - Protocol, - Tuple, - Type, - TypeVar, - Union, - final, -) +from typing import Any, AsyncGenerator, Dict, Generic, Iterable, List, Literal, Optional, Protocol, Tuple, Type, TypeVar, Union, final import torch from transformers import PreTrainedModel, PreTrainedTokenizer @@ -89,11 +74,7 @@ class Runner(Protocol[Mo, To]): class generate_iterator(RunnerMethod[List[int], AsyncGenerator[str, None]]): @staticmethod def async_stream( - prompt_token_ids: List[int], - request_id: str, - stop: Optional[Union[Iterable[str], str]] = ..., - adapter_name: Optional[str] = ..., - **attrs: Any, + prompt_token_ids: List[int], request_id: str, stop: Optional[Union[Iterable[str], str]] = ..., adapter_name: Optional[str] = ..., **attrs: Any ) -> AsyncGenerator[str, None]: ... 
def __init__( diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 52c2ad55..0a2db77f 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -18,20 +18,12 @@ svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[l llm_model_class = openllm.GenerationInput.from_llm_config(llm.config) -@svc.api( - route='/v1/generate', - input=JSON.from_sample(llm_model_class.examples()), - output=JSON.from_sample(openllm.GenerationOutput.examples()), -) +@svc.api(route='/v1/generate', input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples())) async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump() -@svc.api( - route='/v1/generate_stream', - input=JSON.from_sample(llm_model_class.examples()), - output=Text(content_type='text/event-stream'), -) +@svc.api(route='/v1/generate_stream', input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream')) async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]: async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()): yield f'data: {it.model_dump_json()}\n\n' @@ -76,6 +68,4 @@ def helpers_messages_v1(message: MessagesConverterInput) -> str: return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False) -openllm.mount_entrypoints( - svc, llm -) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema. +openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema. diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py index e5fe451d..0febeaa2 100644 --- a/openllm-python/src/openllm/_strategies.py +++ b/openllm-python/src/openllm/_strategies.py @@ -158,16 +158,12 @@ class _ResourceMixin: elif isinstance(spec, list): return [str(x) for x in spec] else: - raise TypeError( - f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead." - ) + raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.") @staticmethod def validate(cls, val: list[t.Any]) -> None: if cls.resource_id == 'amd.com/gpu': - raise RuntimeError( - "AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'" - ) + raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'") if not all(isinstance(i, str) for i in val): raise ValueError('Input list should be all string type.') @@ -311,18 +307,12 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin): worker_index, assigned_resource_per_worker, ) - raise IndexError( - f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]." 
- ) - assigned_gpu = gpus[ - assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1) - ] + raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].") + assigned_gpu = gpus[assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)] dev = ','.join(assigned_gpu) else: idx = worker_index // workers_per_resource if idx >= len(gpus): - raise ValueError( - f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}' - ) + raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}') dev = str(gpus[idx]) return dev diff --git a/openllm-python/src/openllm/_strategies.pyi b/openllm-python/src/openllm/_strategies.pyi index a917267b..8a32d00e 100644 --- a/openllm-python/src/openllm/_strategies.pyi +++ b/openllm-python/src/openllm/_strategies.pyi @@ -13,12 +13,7 @@ class CascadingResourceStrategy: TODO: Support CloudTPUResource """ @classmethod - def get_worker_count( - cls, - runnable_class: Type[bentoml.Runnable], - resource_request: Optional[Dict[str, Any]], - workers_per_resource: float, - ) -> int: + def get_worker_count(cls, runnable_class: Type[bentoml.Runnable], resource_request: Optional[Dict[str, Any]], workers_per_resource: float) -> int: """Return the number of workers to be used for the given runnable class. Note that for all available GPU, the number of workers will always be 1. @@ -40,7 +35,5 @@ class CascadingResourceStrategy: worker_index: The index of the worker, start from 0. """ @staticmethod - def transpile_workers_to_cuda_envvar( - workers_per_resource: Union[float, int], gpus: List[str], worker_index: int - ) -> str: + def transpile_workers_to_cuda_envvar(workers_per_resource: Union[float, int], gpus: List[str], worker_index: int) -> str: """Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.""" diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 56ac5732..c9526419 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -38,12 +38,8 @@ def build_editable(path, package='openllm'): def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None): from . 
import RefResolver - openllm_package = 'openllm[vllm]' if llm.__llm_backend__.lower() == 'vllm' else 'openllm' - packages = [ - 'scipy', - 'bentoml[tracing]>=1.1.11,<1.2', - f'{openllm_package}>={RefResolver.from_strategy("release").version}', - ] # apparently bnb misses this one + openllm_package = 'openllm[vllm]' if llm.__llm_backend__.lower() == "vllm" else "openllm" + packages = ['scipy', 'bentoml[tracing]>=1.1.11,<1.2', f'{openllm_package}>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one if adapter_map is not None: packages += ['openllm[fine-tune]'] if extra_dependencies is not None: @@ -61,18 +57,7 @@ def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=N def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template, serialisation): from openllm_cli.entrypoint import process_environ - environ = process_environ( - llm.config, - llm.config['timeout'], - 1.0, - None, - True, - llm.model_id, - None, - llm._serialisation, - llm, - use_current_env=False, - ) + environ = process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm, use_current_env=False) # XXX: We need to quote this so that the envvar in container recognize as valid json environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'" environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container @@ -101,10 +86,7 @@ def create_bento( 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle', - **{ - f'{package.replace("-", "_")}_version': importlib.metadata.version(package) - for package in {'openllm', 'openllm-core', 'openllm-client'} - }, + **{f'{package.replace("-","_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}}, }) if adapter_map: labels.update(adapter_map) diff --git a/openllm-python/src/openllm/bundle/_package.pyi b/openllm-python/src/openllm/bundle/_package.pyi index fc7c07fc..0a6f2f38 100644 --- a/openllm-python/src/openllm/bundle/_package.pyi +++ b/openllm-python/src/openllm/bundle/_package.pyi @@ -13,10 +13,7 @@ from .._llm import LLM def build_editable(path: str, package: LiteralString) -> Optional[str]: ... def construct_python_options( - llm: LLM[M, T], - llm_fs: FS, - extra_dependencies: Optional[Tuple[str, ...]] = ..., - adapter_map: Optional[Dict[str, str]] = ..., + llm: LLM[M, T], llm_fs: FS, extra_dependencies: Optional[Tuple[str, ...]] = ..., adapter_map: Optional[Dict[str, str]] = ... ) -> PythonOptions: ... 
def construct_docker_options( llm: LLM[M, T], diff --git a/openllm-python/src/openllm/entrypoints/__init__.py b/openllm-python/src/openllm/entrypoints/__init__.py index 06e63854..7ffc455a 100644 --- a/openllm-python/src/openllm/entrypoints/__init__.py +++ b/openllm-python/src/openllm/entrypoints/__init__.py @@ -11,7 +11,5 @@ def mount_entrypoints(svc, llm): return svc -__lazy = LazyModule( - __name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints} -) +__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}) __all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__ diff --git a/openllm-python/src/openllm/entrypoints/_openapi.py b/openllm-python/src/openllm/entrypoints/_openapi.py index 35befd91..395f6c46 100644 --- a/openllm-python/src/openllm/entrypoints/_openapi.py +++ b/openllm-python/src/openllm/entrypoints/_openapi.py @@ -501,11 +501,7 @@ class OpenLLMSchemaGenerator(SchemaGenerator): endpoints_info.extend(sub_endpoints) elif not isinstance(route, Route) or not route.include_in_schema: continue - elif ( - inspect.isfunction(route.endpoint) - or inspect.ismethod(route.endpoint) - or isinstance(route.endpoint, functools.partial) - ): + elif inspect.isfunction(route.endpoint) or inspect.ismethod(route.endpoint) or isinstance(route.endpoint, functools.partial): endpoint = route.endpoint.func if isinstance(route.endpoint, functools.partial) else route.endpoint path = self._remove_converter(route.path) for method in route.methods or ['GET']: @@ -552,9 +548,7 @@ def get_generator(title, components=None, tags=None, inject=True): def component_schema_generator(attr_cls, description=None): schema = {'type': 'object', 'required': [], 'properties': {}, 'title': attr_cls.__name__} - schema['description'] = first_not_none( - getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}' - ) + schema['description'] = first_not_none(getattr(attr_cls, '__doc__', None), description, default=f'Generated components for {attr_cls.__name__}') for field in attr.fields(attr.resolve_types(attr_cls)): attr_type = field.type origin_type = t.get_origin(attr_type) @@ -596,10 +590,7 @@ def component_schema_generator(attr_cls, description=None): _SimpleSchema = types.new_class( - '_SimpleSchema', - (object,), - {}, - lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it}), + '_SimpleSchema', (object,), {}, lambda ns: ns.update({'__init__': lambda self, it: setattr(self, 'it', it), 'asdict': lambda self: self.it}) ) diff --git a/openllm-python/src/openllm/entrypoints/_openapi.pyi b/openllm-python/src/openllm/entrypoints/_openapi.pyi index 4ecb9760..e5502636 100644 --- a/openllm-python/src/openllm/entrypoints/_openapi.pyi +++ b/openllm-python/src/openllm/entrypoints/_openapi.pyi @@ -17,13 +17,8 @@ class OpenLLMSchemaGenerator: def apply_schema(func: Callable[P, Any], **attrs: Any) -> Callable[P, Any]: ... def add_schema_definitions(func: Callable[P, Any]) -> Callable[P, Any]: ... -def append_schemas( - svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ... -) -> Service: ... +def append_schemas(svc: Service, generated_schema: Dict[str, Any], tags_order: Literal['prepend', 'append'] = ..., inject: bool = ...) -> Service: ... def component_schema_generator(attr_cls: Type[AttrsInstance], description: Optional[str] = ...) 
-> Dict[str, Any]: ... def get_generator( - title: str, - components: Optional[List[Type[AttrsInstance]]] = ..., - tags: Optional[List[Dict[str, Any]]] = ..., - inject: bool = ..., + title: str, components: Optional[List[Type[AttrsInstance]]] = ..., tags: Optional[List[Dict[str, Any]]] = ..., inject: bool = ... ) -> OpenLLMSchemaGenerator: ... diff --git a/openllm-python/src/openllm/entrypoints/cohere.py b/openllm-python/src/openllm/entrypoints/cohere.py index 02130bdb..ded3b2a8 100644 --- a/openllm-python/src/openllm/entrypoints/cohere.py +++ b/openllm-python/src/openllm/entrypoints/cohere.py @@ -59,10 +59,7 @@ def error_response(status_code, message): async def check_model(request, model): if request.model is None or request.model == model: return None - return error_response( - HTTPStatus.NOT_FOUND, - f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.", - ) + return error_response(HTTPStatus.NOT_FOUND, f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.") def mount_to_svc(svc, llm): @@ -71,17 +68,13 @@ def mount_to_svc(svc, llm): routes=[ Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False), Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']), - Route( - '/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST'] - ), + Route('/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']), ], ) mount_path = '/cohere' svc.mount_asgi_app(app, path=mount_path) - return append_schemas( - svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG - ) + return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG) @add_schema_definitions @@ -140,18 +133,14 @@ async def cohere_generate(req, llm): if final_result is None: return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.') final_result = final_result.with_options( - outputs=[ - output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) - for output in final_result.outputs - ] + outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs] ) return JSONResponse( converter.unstructure( Generations( id=request_id, generations=[ - Generation(id=request_id, text=output.text, prompt=prompt, finish_reason=output.finish_reason) - for output in final_result.outputs + Generation(id=request_id, text=output.text, prompt=prompt, finish_reason=output.finish_reason) for output in final_result.outputs ], ) ), @@ -258,9 +247,7 @@ async def cohere_chat(req, llm): final_result = res if final_result is None: return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.') - final_result = final_result.with_options( - outputs=[final_result.outputs[0].with_options(text=''.join(texts), token_ids=token_ids)] - ) + final_result = final_result.with_options(outputs=[final_result.outputs[0].with_options(text=''.join(texts), token_ids=token_ids)]) num_prompt_tokens, num_response_tokens = len(final_result.prompt_token_ids), len(token_ids) return JSONResponse( converter.unstructure( diff --git a/openllm-python/src/openllm/entrypoints/cohere.pyi b/openllm-python/src/openllm/entrypoints/cohere.pyi index 51f8d46f..912f1185 100644 --- 
a/openllm-python/src/openllm/entrypoints/cohere.pyi +++ b/openllm-python/src/openllm/entrypoints/cohere.pyi @@ -14,8 +14,6 @@ from ..protocol.cohere import CohereChatRequest, CohereGenerateRequest def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ... def jsonify_attr(obj: AttrsInstance) -> str: ... def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ... -async def check_model( - request: Union[CohereGenerateRequest, CohereChatRequest], model: str -) -> Optional[JSONResponse]: ... +async def check_model(request: Union[CohereGenerateRequest, CohereChatRequest], model: str) -> Optional[JSONResponse]: ... async def cohere_generate(req: Request, llm: LLM[M, T]) -> Response: ... async def cohere_chat(req: Request, llm: LLM[M, T]) -> Response: ... diff --git a/openllm-python/src/openllm/entrypoints/hf.py b/openllm-python/src/openllm/entrypoints/hf.py index fc544d4d..d0d5ad0b 100644 --- a/openllm-python/src/openllm/entrypoints/hf.py +++ b/openllm-python/src/openllm/entrypoints/hf.py @@ -37,10 +37,7 @@ def mount_to_svc(svc, llm): def error_response(status_code, message): - return JSONResponse( - converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), - status_code=status_code.value, - ) + return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value) @add_schema_definitions @@ -56,9 +53,7 @@ async def hf_agent(req, llm): stop = request.parameters.pop('stop', []) try: result = await llm.generate(request.inputs, stop=stop, **request.parameters) - return JSONResponse( - converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value - ) + return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value) except Exception as err: logger.error('Error while generating: %s', err) return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).') diff --git a/openllm-python/src/openllm/entrypoints/openai.py b/openllm-python/src/openllm/entrypoints/openai.py index bc33ad40..72f7d350 100644 --- a/openllm-python/src/openllm/entrypoints/openai.py +++ b/openllm-python/src/openllm/entrypoints/openai.py @@ -61,11 +61,7 @@ def jsonify_attr(obj): def error_response(status_code, message): return JSONResponse( - { - 'error': converter.unstructure( - ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value)) - ) - }, + {'error': converter.unstructure(ErrorResponse(message=message, type='invalid_request_error', code=str(status_code.value)))}, status_code=status_code.value, ) @@ -99,11 +95,7 @@ def create_logprobs(token_ids, top_logprobs, num_output_top_logprobs=None, initi logprobs.text_offset.append(logprobs.text_offset[-1] + last_token_len) last_token_len = len(token) if num_output_top_logprobs: - logprobs.top_logprobs.append( - {llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()} - if step_top_logprobs - else None - ) + logprobs.top_logprobs.append({llm.tokenizer.convert_ids_to_tokens(i): p for i, p in step_top_logprobs.items()} if step_top_logprobs else None) return logprobs @@ -114,14 +106,8 @@ def mount_to_svc(svc, llm): app = Starlette( debug=True, routes=[ - Route( - '/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET'] - ), - Route( - '/completions', - functools.partial(apply_schema(completions, 
__model_id__=llm.llm_type), llm=llm), - methods=['POST'], - ), + Route('/models', functools.partial(apply_schema(list_models, __model_id__=llm.llm_type), llm=llm), methods=['GET']), + Route('/completions', functools.partial(apply_schema(completions, __model_id__=llm.llm_type), llm=llm), methods=['POST']), Route( '/chat/completions', functools.partial( @@ -146,9 +132,7 @@ def mount_to_svc(svc, llm): # GET /v1/models @add_schema_definitions def list_models(_, llm): - return JSONResponse( - converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value - ) + return JSONResponse(converter.unstructure(ModelList(data=[ModelCard(id=llm.llm_type)])), status_code=HTTPStatus.OK.value) # POST /v1/chat/completions @@ -182,9 +166,7 @@ async def chat_completions(req, llm): config = llm.config.compatible_options(request) def get_role() -> str: - return ( - request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant' - ) # TODO: Support custom role here. + return request.messages[-1]['role'] if not request.add_generation_prompt else 'assistant' # TODO: Support custom role here. try: result_generator = llm.generate_iterator(prompt, request_id=request_id, **config) @@ -198,9 +180,7 @@ async def chat_completions(req, llm): id=request_id, created=created_time, model=model_name, - choices=[ - ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason) - ], + choices=[ChatCompletionResponseStreamChoice(index=index, delta=Delta(content=text), finish_reason=finish_reason)], ) if usage is not None: response.usage = usage @@ -251,17 +231,12 @@ async def chat_completions(req, llm): if final_result is None: return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.') final_result = final_result.with_options( - outputs=[ - output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) - for output in final_result.outputs - ] + outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs] ) role = get_role() choices = [ - ChatCompletionResponseChoice( - index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason - ) + ChatCompletionResponseChoice(index=output.index, message=ChatMessage(role=role, content=output.text), finish_reason=output.finish_reason) for output in final_result.outputs ] if request.echo: @@ -275,9 +250,7 @@ async def chat_completions(req, llm): num_prompt_tokens = len(final_result.prompt_token_ids) num_generated_tokens = sum(len(output.token_ids) for output in final_result.outputs) usage = UsageInfo(num_prompt_tokens, num_generated_tokens, num_prompt_tokens + num_generated_tokens) - response = ChatCompletionResponse( - id=request_id, created=created_time, model=model_name, usage=usage, choices=choices - ) + response = ChatCompletionResponse(id=request_id, created=created_time, model=model_name, usage=usage, choices=choices) return JSONResponse(converter.unstructure(response), status_code=HTTPStatus.OK.value) except Exception as err: traceback.print_exc() @@ -369,13 +342,7 @@ async def completions(req, llm): top_logprobs = res.prompt_logprobs previous_echo[i] = True if request.logprobs is not None: - logprobs = create_logprobs( - output.token_ids, - output.logprobs[previous_num_tokens[i] :], - request.logprobs, - len(previous_texts[i]), - llm=llm, - ) + logprobs = create_logprobs(output.token_ids, output.logprobs[previous_num_tokens[i] :], 
request.logprobs, len(previous_texts[i]), llm=llm) previous_num_tokens[i] += len(output.token_ids) previous_texts[i] += output.text yield f'data: {create_stream_response_json(index=i, text=output.text, logprobs=logprobs, finish_reason=output.finish_reason)}\n\n' @@ -402,10 +369,7 @@ async def completions(req, llm): if final_result is None: return error_response(HTTPStatus.BAD_REQUEST, 'No response from model.') final_result = final_result.with_options( - outputs=[ - output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) - for output in final_result.outputs - ] + outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs] ) choices = [] @@ -428,9 +392,7 @@ async def completions(req, llm): output_text = prompt_text + output_text else: output_text = prompt_text - choice_data = CompletionResponseChoice( - index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason - ) + choice_data = CompletionResponseChoice(index=output.index, text=output_text, logprobs=logprobs, finish_reason=output.finish_reason) choices.append(choice_data) num_prompt_tokens = len(final_result.prompt_token_ids) diff --git a/openllm-python/src/openllm/entrypoints/openai.pyi b/openllm-python/src/openllm/entrypoints/openai.pyi index 1b69cade..829728ba 100644 --- a/openllm-python/src/openllm/entrypoints/openai.pyi +++ b/openllm-python/src/openllm/entrypoints/openai.pyi @@ -14,9 +14,7 @@ from ..protocol.openai import ChatCompletionRequest, CompletionRequest, LogProbs def mount_to_svc(svc: Service, llm: LLM[M, T]) -> Service: ... def jsonify_attr(obj: AttrsInstance) -> str: ... def error_response(status_code: HTTPStatus, message: str) -> JSONResponse: ... -async def check_model( - request: Union[CompletionRequest, ChatCompletionRequest], model: str -) -> Optional[JSONResponse]: ... +async def check_model(request: Union[CompletionRequest, ChatCompletionRequest], model: str) -> Optional[JSONResponse]: ... 
def create_logprobs( token_ids: List[int], top_logprobs: List[Dict[int, float]], # diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index 64529ba2..89586d7e 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -30,9 +30,7 @@ def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]: 'For example: "bentoml.transformers.save_model(..., custom_objects={\'tokenizer\': tokenizer})"' ) from None else: - tokenizer = transformers.AutoTokenizer.from_pretrained( - bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs - ) + tokenizer = transformers.AutoTokenizer.from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.trust_remote_code, **tokenizer_attrs) if tokenizer.pad_token_id is None: if config.pad_token_id is not None: diff --git a/openllm-python/src/openllm/serialisation/_helpers.py b/openllm-python/src/openllm/serialisation/_helpers.py index 5557bc6b..c155a844 100644 --- a/openllm-python/src/openllm/serialisation/_helpers.py +++ b/openllm-python/src/openllm/serialisation/_helpers.py @@ -29,9 +29,7 @@ def patch_correct_tag(llm, config, _revision=None) -> None: if _revision is None and llm.tag.version is not None: _revision = llm.tag.version if llm.tag.version is None: - _object_setattr( - llm, '_tag', attr.evolve(llm.tag, version=_revision) - ) # HACK: This copies the correct revision into llm.tag + _object_setattr(llm, '_tag', attr.evolve(llm.tag, version=_revision)) # HACK: This copies the correct revision into llm.tag if llm._revision is None: _object_setattr(llm, '_revision', _revision) # HACK: This copies the correct revision into llm._model_version @@ -47,9 +45,7 @@ def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadat if trust_remote_code: auto_map = getattr(config, 'auto_map', {}) if not auto_map: - raise RuntimeError( - f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}' - ) + raise RuntimeError(f'Failed to determine the architecture from both `auto_map` and `architectures` from {llm.model_id}') autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM' if autoclass not in auto_map: raise RuntimeError( @@ -60,10 +56,7 @@ def _create_metadata(llm, config, safe_serialisation, trust_remote_code, metadat raise RuntimeError( 'Failed to determine the architecture for this model. 
Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`' ) - metadata.update({ - '_pretrained_class': architectures[0], - '_revision': get_hash(config) if not llm.local else llm.revision, - }) + metadata.update({'_pretrained_class': architectures[0], '_revision': get_hash(config) if not llm.local else llm.revision}) return metadata @@ -144,9 +137,7 @@ def save_model( bentomodel.flush() bentomodel.save(_model_store) openllm.utils.analytics.track( - openllm.utils.analytics.ModelSaveEvent( - module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024 - ) + openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024) ) finally: bentomodel.exit_cloudpickle_context(imported_modules) diff --git a/openllm-python/src/openllm/serialisation/_helpers.pyi b/openllm-python/src/openllm/serialisation/_helpers.pyi index 7f2628a1..4516599d 100644 --- a/openllm-python/src/openllm/serialisation/_helpers.pyi +++ b/openllm-python/src/openllm/serialisation/_helpers.pyi @@ -10,9 +10,7 @@ from openllm_core._typing_compat import M, T from .._llm import LLM def get_hash(config: transformers.PretrainedConfig) -> str: ... -def patch_correct_tag( - llm: LLM[M, T], config: transformers.PretrainedConfig, _revision: Optional[str] = ... -) -> None: ... +def patch_correct_tag(llm: LLM[M, T], config: transformers.PretrainedConfig, _revision: Optional[str] = ...) -> None: ... @contextmanager def save_model( llm: LLM[M, T], diff --git a/openllm-python/src/openllm/serialisation/ctranslate/__init__.py b/openllm-python/src/openllm/serialisation/ctranslate/__init__.py index 471e4418..b4a86634 100644 --- a/openllm-python/src/openllm/serialisation/ctranslate/__init__.py +++ b/openllm-python/src/openllm/serialisation/ctranslate/__init__.py @@ -12,9 +12,7 @@ from .._helpers import patch_correct_tag, save_model from ..transformers._helpers import get_tokenizer, process_config if not is_ctranslate_available(): - raise RuntimeError( - "'ctranslate2' is required to use with backend 'ctranslate'. Install it with 'pip install \"openllm[ctranslate]\"'" - ) + raise RuntimeError("'ctranslate2' is required to use with backend 'ctranslate'. 
Install it with 'pip install \"openllm[ctranslate]\"'") import ctranslate2 from ctranslate2.converters.transformers import TransformersConverter @@ -44,17 +42,11 @@ def import_model(llm, *decls, trust_remote_code, **attrs): config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs) patch_correct_tag(llm, config) tokenizer = get_tokenizer(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs) - with save_model( - llm, config, False, trust_remote_code, 'ctranslate', [importlib.import_module(tokenizer.__module__)] - ) as save_metadata: + with save_model(llm, config, False, trust_remote_code, 'ctranslate', [importlib.import_module(tokenizer.__module__)]) as save_metadata: bentomodel, _ = save_metadata if llm._local: shutil.copytree( - llm.model_id, - bentomodel.path, - symlinks=False, - ignore=shutil.ignore_patterns('.git', 'venv', '__pycache__', '.venv'), - dirs_exist_ok=True, + llm.model_id, bentomodel.path, symlinks=False, ignore=shutil.ignore_patterns('.git', 'venv', '__pycache__', '.venv'), dirs_exist_ok=True ) else: TransformersConverter( @@ -74,9 +66,7 @@ def get(llm): model = bentoml.models.get(llm.tag) backend = model.info.labels['backend'] if backend != llm.__llm_backend__: - raise OpenLLMException( - f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'." - ) + raise OpenLLMException(f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'.") patch_correct_tag( llm, transformers.AutoConfig.from_pretrained(model.path_of('/hf/'), trust_remote_code=llm.trust_remote_code), diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index ce9916ca..47f03478 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -12,7 +12,7 @@ from .._helpers import patch_correct_tag, save_model logger = logging.getLogger(__name__) -__all__ = ['get', 'import_model', 'load_model'] +__all__ = ['import_model', 'get', 'load_model'] _object_setattr = object.__setattr__ @@ -44,13 +44,7 @@ def import_model(llm, *decls, trust_remote_code, **attrs): f.write(orjson.dumps(config.quantization_config, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode()) if llm._local: # possible local path model = infer_autoclass_from_llm(llm, config).from_pretrained( - llm.model_id, - *decls, - local_files_only=True, - config=config, - trust_remote_code=trust_remote_code, - **hub_attrs, - **attrs, + llm.model_id, *decls, local_files_only=True, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs ) # for trust_remote_code to work bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules) @@ -74,9 +68,7 @@ def get(llm): model = bentoml.models.get(llm.tag) backend = model.info.labels['backend'] if backend != llm.__llm_backend__: - raise OpenLLMException( - f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'." 
- ) + raise OpenLLMException(f"'{model.tag!s}' was saved with backend '{backend}', while loading with '{llm.__llm_backend__}'.") patch_correct_tag( llm, transformers.AutoConfig.from_pretrained(model.path, trust_remote_code=llm.trust_remote_code), @@ -132,9 +124,7 @@ def load_model(llm, *decls, **attrs): ) except Exception as err: logger.debug("Failed to load model with 'use_flash_attention_2' (lookup for traceback):\n%s", err) - model = auto_class.from_pretrained( - llm.bentomodel.path, device_map=device_map, trust_remote_code=llm.trust_remote_code, **attrs - ) + model = auto_class.from_pretrained(llm.bentomodel.path, device_map=device_map, trust_remote_code=llm.trust_remote_code, **attrs) else: try: model = auto_class.from_pretrained( @@ -149,12 +139,7 @@ def load_model(llm, *decls, **attrs): except Exception as err: logger.debug("Failed to load model with 'use_flash_attention_2' (lookup for traceback):\n%s", err) model = auto_class.from_pretrained( - llm.bentomodel.path, - *decls, - config=config, - trust_remote_code=llm.trust_remote_code, - device_map=device_map, - **attrs, + llm.bentomodel.path, *decls, config=config, trust_remote_code=llm.trust_remote_code, device_map=device_map, **attrs ) check_unintialised_params(model) return model diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index 9d436540..9b8500e1 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -6,9 +6,7 @@ logger = logging.getLogger(__name__) def get_tokenizer(model_id_or_path, trust_remote_code, **attrs): - tokenizer = transformers.AutoTokenizer.from_pretrained( - model_id_or_path, trust_remote_code=trust_remote_code, **attrs - ) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code, **attrs) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token return tokenizer diff --git a/openllm-python/src/openllm/utils.py b/openllm-python/src/openllm/utils.py index 890f75d8..3d504fb0 100644 --- a/openllm-python/src/openllm/utils.py +++ b/openllm-python/src/openllm/utils.py @@ -1,6 +1,6 @@ import functools, importlib.metadata, openllm_core -__all__ = ['available_devices', 'device_count', 'generate_labels'] +__all__ = ['generate_labels', 'available_devices', 'device_count'] def generate_labels(llm): diff --git a/openllm-python/src/openllm_cli/_factory.py b/openllm-python/src/openllm_cli/_factory.py index cbbff21e..c173977a 100644 --- a/openllm-python/src/openllm_cli/_factory.py +++ b/openllm-python/src/openllm_cli/_factory.py @@ -5,25 +5,12 @@ from bentoml_cli.utils import BentoMLCommandGroup from click import shell_completion as sc from openllm_core._configuration import LLMConfig -from openllm_core._typing_compat import ( - Concatenate, - DictStrAny, - LiteralBackend, - LiteralSerialisation, - ParamSpec, - AnyCallable, - get_literal_args, -) +from openllm_core._typing_compat import Concatenate, DictStrAny, LiteralBackend, LiteralSerialisation, ParamSpec, AnyCallable, get_literal_args from openllm_core.utils import DEBUG, compose, dantic, resolve_user_filepath class _OpenLLM_GenericInternalConfig(LLMConfig): - __config__ = { - 'name_type': 'lowercase', - 'default_id': 'openllm/generic', - 'model_ids': ['openllm/generic'], - 'architecture': 'PreTrainedModel', - } + __config__ = {'name_type': 'lowercase', 'default_id': 'openllm/generic', 'model_ids': 
['openllm/generic'], 'architecture': 'PreTrainedModel'} class GenerationConfig: top_k: int = 15 @@ -50,20 +37,11 @@ def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: - return [ - sc.CompletionItem(inflection.dasherize(it), help='Model') - for it in openllm.CONFIG_MAPPING - if it.startswith(incomplete) - ] + return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)] def parse_config_options( - config: LLMConfig, - server_timeout: int, - workers_per_resource: float, - device: t.Tuple[str, ...] | None, - cors: bool, - environ: DictStrAny, + config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny ) -> DictStrAny: # TODO: Support amd.com/gpu on k8s _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '') @@ -78,21 +56,14 @@ def parse_config_options( if device: if len(device) > 1: _bentoml_config_options_opts.extend([ - f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' - for idx, dev in enumerate(device) + f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device) ]) else: - _bentoml_config_options_opts.append( - f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]' - ) + _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]') if cors: + _bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"']) _bentoml_config_options_opts.extend([ - 'api_server.http.cors.enabled=true', - 'api_server.http.cors.access_control_allow_origins="*"', - ]) - _bentoml_config_options_opts.extend([ - f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' - for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT']) + f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT']) ]) _bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts) environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env @@ -171,9 +142,7 @@ def start_decorator(fn: FC) -> FC: return composed(fn) -def parse_device_callback( - _: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None -) -> t.Tuple[str, ...] | None: +def parse_device_callback(_: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None: if value is None: return value el: t.Tuple[str, ...] 
= tuple(i for k in value for i in k) @@ -192,19 +161,13 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'} def parse_serve_args() -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]: from bentoml_cli.cli import cli - group = cog.optgroup.group( - 'Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]' - ) + group = cog.optgroup.group('Start a HTTP server options', help='Related to serving the model [synonymous to `bentoml serve-http`]') def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]: serve_command = cli.commands['serve'] # The first variable is the argument bento # The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS - serve_options = [ - p - for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] - if p.name not in _IGNORED_OPTIONS - ] + serve_options = [p for p in serve_command.params[1 : -BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS] for options in reversed(serve_options): attrs = options.to_info_dict() # we don't need param_type_name, since it should all be options @@ -258,13 +221,7 @@ def adapter_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callab def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( - '--cors/--no-cors', - show_default=True, - default=False, - envvar='OPENLLM_CORS', - show_envvar=True, - help='Enable CORS for the server.', - **attrs, + '--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs )(f) @@ -318,12 +275,7 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[ def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_argument( - 'model_name', - type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), - required=required, - **attrs, - )(f) + return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f) def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]: @@ -361,9 +313,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att )(f) -def workers_per_resource_option( - f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any -) -> t.Callable[[FC], FC]: +def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( '--workers-per-resource', default=None, @@ -431,9 +381,7 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va float(value) # type: ignore[arg-type] except ValueError: raise click.BadParameter( - f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", - ctx, - param, + f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param ) from None else: return value diff --git a/openllm-python/src/openllm_cli/_sdk.py b/openllm-python/src/openllm_cli/_sdk.py index bd3b3f81..70d24339 100644 --- a/openllm-python/src/openllm_cli/_sdk.py +++ b/openllm-python/src/openllm_cli/_sdk.py @@ -69,10 +69,7 @@ def _start( if timeout: args.extend(['--server-timeout', str(timeout)]) if 
workers_per_resource: - args.extend([ - '--workers-per-resource', - str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource, - ]) + args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource]) if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)]) if quantize: @@ -80,11 +77,7 @@ def _start( if cors: args.append('--cors') if adapter_map: - args.extend( - list( - itertools.chain.from_iterable([['--adapter-id', f"{k}{':' + v if v else ''}"] for k, v in adapter_map.items()]) - ) - ) + args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()]))) if additional_args: args.extend(additional_args) if __test__: @@ -155,9 +148,7 @@ def _build( '--machine', '--quiet', '--serialisation', - first_not_none( - serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy' - ), + first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'), ] if quantize: args.extend(['--quantize', quantize]) @@ -174,7 +165,7 @@ def _build( if overwrite: args.append('--overwrite') if adapter_map: - args.extend([f"--adapter-id={k}{':' + v if v is not None else ''}" for k, v in adapter_map.items()]) + args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in adapter_map.items()]) if model_version: args.extend(['--model-version', model_version]) if bento_version: @@ -274,4 +265,4 @@ start, build, import_model, list_models = ( codegen.gen_sdk(_import_model), codegen.gen_sdk(_list_models), ) -__all__ = ['build', 'import_model', 'list_models', 'start'] +__all__ = ['start', 'build', 'import_model', 'list_models'] diff --git a/openllm-python/src/openllm_cli/entrypoint.py b/openllm-python/src/openllm_cli/entrypoint.py index d6a134da..6506dabd 100644 --- a/openllm-python/src/openllm_cli/entrypoint.py +++ b/openllm-python/src/openllm_cli/entrypoint.py @@ -43,15 +43,7 @@ from openllm_core.utils import ( ) from . import termui -from ._factory import ( - FC, - _AnyCallable, - machine_option, - model_name_argument, - parse_config_options, - start_decorator, - optimization_decorator, -) +from ._factory import FC, _AnyCallable, machine_option, model_name_argument, parse_config_options, start_decorator, optimization_decorator if t.TYPE_CHECKING: import torch @@ -103,18 +95,12 @@ def backend_warning(backend: LiteralBackend, build: bool = False) -> None: 'vLLM is not available. Note that PyTorch backend is not as performant as vLLM and you should always consider using vLLM for production.' ) if build: - logger.info( - "Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally." 
- ) + logger.info("Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally.") class Extensions(click.MultiCommand): def list_commands(self, ctx: click.Context) -> list[str]: - return sorted([ - filename[:-3] - for filename in os.listdir(_EXT_FOLDER) - if filename.endswith('.py') and not filename.startswith('__') - ]) + return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith('.py') and not filename.startswith('__')]) def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None: try: @@ -131,41 +117,19 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): def common_params(f: t.Callable[P, t.Any]) -> t.Callable[[FC], FC]: # The following logics is similar to one of BentoMLCommandGroup @cog.optgroup.group(name='Global options', help='Shared globals options for all OpenLLM CLI.') # type: ignore[misc] + @cog.optgroup.option('-q', '--quiet', envvar=QUIET_ENV_VAR, is_flag=True, default=False, help='Suppress all output.', show_envvar=True) @cog.optgroup.option( - '-q', '--quiet', envvar=QUIET_ENV_VAR, is_flag=True, default=False, help='Suppress all output.', show_envvar=True + '--debug', '--verbose', 'debug', envvar=DEBUG_ENV_VAR, is_flag=True, default=False, help='Print out debug logs.', show_envvar=True ) @cog.optgroup.option( - '--debug', - '--verbose', - 'debug', - envvar=DEBUG_ENV_VAR, - is_flag=True, - default=False, - help='Print out debug logs.', - show_envvar=True, + '--do-not-track', is_flag=True, default=False, envvar=analytics.OPENLLM_DO_NOT_TRACK, help='Do not send usage info', show_envvar=True ) @cog.optgroup.option( - '--do-not-track', - is_flag=True, - default=False, - envvar=analytics.OPENLLM_DO_NOT_TRACK, - help='Do not send usage info', - show_envvar=True, - ) - @cog.optgroup.option( - '--context', - 'cloud_context', - envvar='BENTOCLOUD_CONTEXT', - type=click.STRING, - default=None, - help='BentoCloud context name.', - show_envvar=True, + '--context', 'cloud_context', envvar='BENTOCLOUD_CONTEXT', type=click.STRING, default=None, help='BentoCloud context name.', show_envvar=True ) @click.pass_context @functools.wraps(f) - def wrapper( - ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs - ) -> t.Any: + def wrapper(ctx: click.Context, quiet: bool, debug: bool, cloud_context: str | None, *args: P.args, **attrs: P.kwargs) -> t.Any: ctx.obj = GlobalOptions(cloud_context=cloud_context) if quiet: set_quiet_mode(True) @@ -179,9 +143,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): return wrapper @staticmethod - def usage_tracking( - func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any - ) -> t.Callable[Concatenate[bool, P], t.Any]: + def usage_tracking(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[Concatenate[bool, P], t.Any]: command_name = attrs.get('name', func.__name__) @functools.wraps(func) @@ -240,9 +202,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): _memo = getattr(wrapped, '__click_params__', None) if _memo is None: raise ValueError('Click command not register correctly.') - _object_setattr( - wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS :] + _memo[: -self.NUMBER_OF_COMMON_PARAMS] - ) + _object_setattr(wrapped, '__click_params__', _memo[-self.NUMBER_OF_COMMON_PARAMS :] + _memo[: -self.NUMBER_OF_COMMON_PARAMS]) # NOTE: we need to call super of super to avoid conflict with BentoMLCommandGroup command setup cmd = super(BentoMLCommandGroup, 
self).command(*args, **kwargs)(wrapped) # NOTE: add aliases to a given commands if it is specified. @@ -250,7 +210,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): if not cmd.name: raise ValueError('name is required when aliases are available.') self._commands[cmd.name] = aliases - self._aliases.update(dict.fromkeys(aliases, cmd.name)) + self._aliases.update({alias: cmd.name for alias in aliases}) return cmd return decorator @@ -317,12 +277,7 @@ def cli() -> None: """ -@cli.command( - context_settings=termui.CONTEXT_SETTINGS, - name='start', - aliases=['start-http'], - short_help='Start a LLMServer for any supported LLM.', -) +@cli.command(context_settings=termui.CONTEXT_SETTINGS, name='start', aliases=['start-http'], short_help='Start a LLMServer for any supported LLM.') @click.argument('model_id', type=click.STRING, metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', required=True) @click.option( '--model-id', @@ -375,9 +330,7 @@ def start_command( ``` """ if backend == 'pt': - logger.warning( - 'PyTorch backend is deprecated and will be removed in future releases. Make sure to use vLLM instead.' - ) + logger.warning('PyTorch backend is deprecated and will be removed in future releases. Make sure to use vLLM instead.') if model_id in openllm.CONFIG_MAPPING: _model_name = model_id if deprecated_model_id is not None: @@ -395,17 +348,11 @@ def start_command( from openllm.serialisation.transformers.weights import has_safetensors_weights - serialisation = first_not_none( - serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy' - ) + serialisation = first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy') if serialisation == 'safetensors' and quantize is not None: logger.warning("'--quantize=%s' might not work with 'safetensors' serialisation format.", quantize) - logger.warning( - "Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.", - model_id, - serialisation, - ) + logger.warning("Make sure to check out '%s' repository to see if the weights is in '%s' format if unsure.", model_id, serialisation) logger.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.") import torch @@ -433,9 +380,7 @@ def start_command( config, server_attrs = llm.config.model_validate_click(**attrs) server_timeout = first_not_none(server_timeout, default=config['timeout']) server_attrs.update({'working_dir': pkg.source_locations('openllm'), 'timeout': server_timeout}) - development = server_attrs.pop( - 'development' - ) # XXX: currently, theres no development args in bentoml.Server. To be fixed upstream. + development = server_attrs.pop('development') # XXX: currently, theres no development args in bentoml.Server. To be fixed upstream. 
server_attrs.setdefault('production', not development) start_env = process_environ( @@ -465,12 +410,8 @@ def start_command( return config -def process_environ( - config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True -): - environ = parse_config_options( - config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {} - ) +def process_environ(config, server_timeout, wpr, device, cors, model_id, adapter_map, serialisation, llm, use_current_env=True): + environ = parse_config_options(config, server_timeout, wpr, device, cors, os.environ.copy() if use_current_env else {}) environ.update({ 'OPENLLM_MODEL_ID': model_id, 'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()), @@ -515,8 +456,7 @@ def build_bento_instruction(llm, model_id, serialisation, adapter_map): cmd_name += f' --serialization {serialisation}' if adapter_map is not None: cmd_name += ' ' + ' '.join([ - f'--adapter-id {s}' - for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()] + f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()] ]) if not openllm.utils.get_quiet_mode(): termui.info(f"🚀Tip: run '{cmd_name}' to create a BentoLLM for '{model_id}'") @@ -551,12 +491,8 @@ def run_server(args, env, return_process=False) -> subprocess.Popen[bytes] | int if return_process: return process stop_event = threading.Event() - stdout, stderr = ( - threading.Thread(target=handle, args=(process.stdout, stop_event)), - threading.Thread(target=handle, args=(process.stderr, stop_event)), - ) - stdout.start() - stderr.start() # noqa: E702 + stdout, stderr = threading.Thread(target=handle, args=(process.stdout, stop_event)), threading.Thread(target=handle, args=(process.stderr, stop_event)) + stdout.start(); stderr.start() # noqa: E702 try: process.wait() @@ -571,12 +507,9 @@ def run_server(args, env, return_process=False) -> subprocess.Popen[bytes] | int raise finally: stop_event.set() - stdout.join() - stderr.join() # noqa: E702 - if process.poll() is not None: - process.kill() - stdout.join() - stderr.join() # noqa: E702 + stdout.join(); stderr.join() # noqa: E702 + if process.poll() is not None: process.kill() + stdout.join(); stderr.join() # noqa: E702 return process.returncode @@ -664,10 +597,7 @@ def import_command( backend=backend, dtype=dtype, serialisation=t.cast( - LiteralSerialisation, - first_not_none( - serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy' - ), + LiteralSerialisation, first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy') ), ) backend_warning(llm.__llm_backend__) @@ -726,21 +656,14 @@ class BuildBentoOutput(t.TypedDict): metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model]', help='Deprecated. Use positional argument instead.', ) -@click.option( - '--bento-version', - type=str, - default=None, - help='Optional bento version for this BentoLLM. Default is the the model revision.', -) +@click.option('--bento-version', type=str, default=None, help='Optional bento version for this BentoLLM. Default is the the model revision.') @click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.') @click.option( '--enable-features', multiple=True, nargs=1, metavar='FEATURE[,FEATURE]', - help='Enable additional features for building this LLM Bento. 
Available: {}'.format( - ', '.join(OPTIONAL_DEPENDENCIES) - ), + help='Enable additional features for building this LLM Bento. Available: {}'.format(', '.join(OPTIONAL_DEPENDENCIES)), ) @optimization_decorator @click.option( @@ -751,12 +674,7 @@ class BuildBentoOutput(t.TypedDict): help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.", ) @click.option('--build-ctx', help='Build context. This is required if --adapter-id uses relative path', default=None) -@click.option( - '--dockerfile-template', - default=None, - type=click.File(), - help='Optional custom dockerfile template to be used with this BentoLLM.', -) +@click.option('--dockerfile-template', default=None, type=click.File(), help='Optional custom dockerfile template to be used with this BentoLLM.') @cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name='Utilities options') # type: ignore[misc] @cog.optgroup.option( '--containerize', @@ -849,9 +767,7 @@ def build_command( state = ItemState.NOT_FOUND if backend == 'pt': - logger.warning( - "PyTorch backend is deprecated and will be removed from the next releases. Will set default backend to 'vllm' instead." - ) + logger.warning("PyTorch backend is deprecated and will be removed from the next releases. Will set default backend to 'vllm' instead.") llm = openllm.LLM( model_id=model_id, @@ -861,9 +777,7 @@ def build_command( dtype=dtype, max_model_len=max_model_len, gpu_memory_utilization=gpu_memory_utilization, - serialisation=first_not_none( - serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy' - ), + serialisation=first_not_none(serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'), _eager=False, ) if llm.__llm_backend__ not in llm.config['backend']: @@ -875,9 +789,7 @@ def build_command( model = openllm.serialisation.import_model(llm, trust_remote_code=llm.trust_remote_code) llm._tag = model.tag - os.environ.update( - **process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm) - ) + os.environ.update(**process_environ(llm.config, llm.config['timeout'], 1.0, None, True, llm.model_id, None, llm._serialisation, llm)) try: assert llm.bentomodel # HACK: call it here to patch correct tag with revision and everything @@ -944,11 +856,7 @@ def build_command( def get_current_bentocloud_context() -> str | None: try: - context = ( - cloud_config.get_context(ctx.obj.cloud_context) - if ctx.obj.cloud_context - else cloud_config.get_current_context() - ) + context = cloud_config.get_context(ctx.obj.cloud_context) if ctx.obj.cloud_context else cloud_config.get_current_context() return context.name except Exception: return None @@ -972,9 +880,7 @@ def build_command( tag=str(bento_tag), backend=llm.__llm_backend__, instructions=[ - DeploymentInstruction.from_content( - type='bentocloud', instr="☁️ Push to BentoCloud with 'bentoml push':\n $ {cmd}", cmd=push_cmd - ), + DeploymentInstruction.from_content(type='bentocloud', instr="☁️ Push to BentoCloud with 'bentoml push':\n $ {cmd}", cmd=push_cmd), DeploymentInstruction.from_content( type='container', instr="🐳 Container BentoLLM with 'bentoml containerize':\n $ {cmd}", @@ -1000,9 +906,7 @@ def build_command( termui.echo(f" * {instruction['content']}\n", nl=False) if push: - BentoMLContainer.bentocloud_client.get().push_bento( - bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push - ) + 
BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push) elif containerize: container_backend = t.cast('DefaultBuilder', os.environ.get('BENTOML_CONTAINERIZE_BACKEND', 'docker')) try: @@ -1042,8 +946,7 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]: architecture=config.__openllm_architecture__, example_id=random.choice(config.__openllm_model_ids__), supported_backends=config.__openllm_backend__, - installation='pip install ' - + (f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config.__openllm_requirements__ else 'openllm'), + installation='pip install ' + (f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config.__openllm_requirements__ else 'openllm'), items=[ str(md.tag) for md in bentoml.models.list() @@ -1062,13 +965,7 @@ def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]: @cli.command() @model_name_argument(required=False) @click.option('-y', '--yes', '--assume-yes', is_flag=True, help='Skip confirmation when deleting a specific model') -@click.option( - '--include-bentos/--no-include-bentos', - is_flag=True, - hidden=True, - default=True, - help='Whether to also include pruning bentos.', -) +@click.option('--include-bentos/--no-include-bentos', is_flag=True, hidden=True, default=True, help='Whether to also include pruning bentos.') @inject @click.pass_context def prune_command( @@ -1085,32 +982,24 @@ def prune_command( If a model type is passed, then only prune models for that given model type. """ available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [ - (m, model_store) - for m in bentoml.models.list() - if 'framework' in m.info.labels and m.info.labels['framework'] == 'openllm' + (m, model_store) for m in bentoml.models.list() if 'framework' in m.info.labels and m.info.labels['framework'] == 'openllm' ] if model_name is not None: available = [ - (m, store) - for m, store in available - if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name) + (m, store) for m, store in available if 'model_name' in m.info.labels and m.info.labels['model_name'] == inflection.underscore(model_name) ] + [ (b, bento_store) for b in bentoml.bentos.list() if 'start_name' in b.info.labels and b.info.labels['start_name'] == inflection.underscore(model_name) ] else: - available += [ - (b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels - ] + available += [(b, bento_store) for b in bentoml.bentos.list() if '_type' in b.info.labels and '_framework' in b.info.labels] for store_item, store in available: if yes: delete_confirmed = True else: - delete_confirmed = click.confirm( - f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?" - ) + delete_confirmed = click.confirm(f"delete {'model' if isinstance(store, ModelStore) else 'bento'} {store_item.tag}?") if delete_confirmed: store.delete(store_item.tag) termui.warning(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.") @@ -1157,17 +1046,8 @@ def shared_client_options(f: _AnyCallable | None = None) -> t.Callable[[FC], FC] @cli.command() @shared_client_options -@click.option( - '--server-type', - type=click.Choice(['grpc', 'http']), - help='Server type', - default='http', - show_default=True, - hidden=True, -) -@click.option( - '--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.' 
-) +@click.option('--server-type', type=click.Choice(['grpc', 'http']), help='Server type', default='http', show_default=True, hidden=True) +@click.option('--stream/--no-stream', type=click.BOOL, is_flag=True, default=True, help='Whether to stream the response.') @click.argument('prompt', type=click.STRING) @click.option( '--sampling-params', diff --git a/openllm-python/src/openllm_cli/extension/dive_bentos.py b/openllm-python/src/openllm_cli/extension/dive_bentos.py index 541d07bf..db488004 100644 --- a/openllm-python/src/openllm_cli/extension/dive_bentos.py +++ b/openllm-python/src/openllm_cli/extension/dive_bentos.py @@ -21,9 +21,7 @@ if t.TYPE_CHECKING: @machine_option @click.pass_context @inject -def cli( - ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store] -) -> str | None: +def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None: """Dive into a BentoLLM. This is synonymous to cd $(b get : -o path).""" try: bentomodel = _bento_store.get(bento) diff --git a/openllm-python/src/openllm_cli/extension/get_containerfile.py b/openllm-python/src/openllm_cli/extension/get_containerfile.py index 50798829..88605414 100644 --- a/openllm-python/src/openllm_cli/extension/get_containerfile.py +++ b/openllm-python/src/openllm_cli/extension/get_containerfile.py @@ -17,9 +17,7 @@ if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore -@click.command( - 'get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.' -) +@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.') @click.argument('bento', type=str, shell_complete=bento_complete_envvar) @click.pass_context @inject diff --git a/openllm-python/src/openllm_cli/extension/get_prompt.py b/openllm-python/src/openllm_cli/extension/get_prompt.py index 0e64c230..b679577f 100644 --- a/openllm-python/src/openllm_cli/extension/get_prompt.py +++ b/openllm-python/src/openllm_cli/extension/get_prompt.py @@ -22,9 +22,7 @@ class PromptFormatter(string.Formatter): raise ValueError('Positional arguments are not supported') return super().vformat(format_string, args, kwargs) - def check_unused_args( - self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any] - ) -> None: + def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> None: extras = set(kwargs).difference(used_args) if extras: raise KeyError(f'Extra params passed: {extras}') @@ -58,9 +56,7 @@ class PromptTemplate: try: return self.template.format(**prompt_variables) except KeyError as e: - raise RuntimeError( - f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template." 
- ) from None + raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template.") from None @click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS) @@ -128,21 +124,15 @@ def cli( if prompt_template_file and chat_template_file: ctx.fail('prompt-template-file and chat-template-file are mutually exclusive.') - acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set( - inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys() - ) + acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set(inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys()) if model_id in acceptable: - logger.warning( - 'Using a default prompt from OpenLLM. Note that this prompt might not work for your intended usage.\n' - ) + logger.warning('Using a default prompt from OpenLLM. Note that this prompt might not work for your intended usage.\n') config = openllm.AutoConfig.for_model(model_id) template = prompt_template_file.read() if prompt_template_file is not None else config.template system_message = system_message or config.system_message try: - formatted = ( - PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized) - ) + formatted = PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized) except RuntimeError as err: logger.debug('Exception caught while formatting prompt: %s', err) ctx.fail(str(err)) @@ -159,21 +149,15 @@ def cli( for architecture in config.architectures: if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE(): system_message = ( - openllm.AutoConfig.infer_class_from_name( - openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture] - ) + openllm.AutoConfig.infer_class_from_name(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]) .model_construct_env() .system_message ) break else: - ctx.fail( - f'Failed to infer system message from model architecture: {config.architectures}. Please pass in --system-message' - ) + ctx.fail(f'Failed to infer system message from model architecture: {config.architectures}. 
Please pass in --system-message') messages = [{'role': 'system', 'content': system_message}, {'role': 'user', 'content': prompt}] - formatted = tokenizer.apply_chat_template( - messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False - ) + formatted = tokenizer.apply_chat_template(messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False) termui.echo(orjson.dumps({'prompt': formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white') ctx.exit(0) diff --git a/openllm-python/src/openllm_cli/extension/list_models.py b/openllm-python/src/openllm_cli/extension/list_models.py index 6eb49e07..eb18ce0d 100644 --- a/openllm-python/src/openllm_cli/extension/list_models.py +++ b/openllm-python/src/openllm_cli/extension/list_models.py @@ -33,17 +33,12 @@ def cli(model_name: str | None) -> DictStrAny: } if model_name is not None: ids_in_local_store = { - k: [ - i - for i in v - if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name) - ] + k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items() } ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v} local_models = { - k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] - for k, val in ids_in_local_store.items() + k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items() } termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white') return local_models diff --git a/openllm-python/src/openllm_cli/extension/playground.py b/openllm-python/src/openllm_cli/extension/playground.py index fcbc128b..f8e5b4da 100644 --- a/openllm-python/src/openllm_cli/extension/playground.py +++ b/openllm-python/src/openllm_cli/extension/playground.py @@ -32,14 +32,7 @@ def load_notebook_metadata() -> DictStrAny: @click.command('playground', context_settings=termui.CONTEXT_SETTINGS) @click.argument('output-dir', default=None, required=False) -@click.option( - '--port', - envvar='JUPYTER_PORT', - show_envvar=True, - show_default=True, - default=8888, - help='Default port for Jupyter server', -) +@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server') @click.pass_context def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: """OpenLLM Playground. @@ -60,9 +53,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"' """ if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available(): - raise RuntimeError( - "Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'" - ) + raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. 
   metadata = load_notebook_metadata()
   _temp_dir = False
   if output_dir is None:
@@ -74,9 +65,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
     termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
   for module in pkgutil.iter_modules(playground.__path__):
     if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
-      logger.debug(
-        'Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module'
-      )
+      logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
       continue
     if not isinstance(module.module_finder, importlib.machinery.FileFinder):
       continue
diff --git a/openllm-python/src/openllm_cli/termui.py b/openllm-python/src/openllm_cli/termui.py
index dfe4f8fa..c442d3be 100644
--- a/openllm-python/src/openllm_cli/termui.py
+++ b/openllm-python/src/openllm_cli/termui.py
@@ -25,14 +25,7 @@ class Level(enum.IntEnum):
   @property
   def color(self) -> str | None:
-    return {
-      Level.NOTSET: None,
-      Level.DEBUG: 'cyan',
-      Level.INFO: 'green',
-      Level.WARNING: 'yellow',
-      Level.ERROR: 'red',
-      Level.CRITICAL: 'red',
-    }[self]
+    return {Level.NOTSET: None, Level.DEBUG: 'cyan', Level.INFO: 'green', Level.WARNING: 'yellow', Level.ERROR: 'red', Level.CRITICAL: 'red'}[self]
   @classmethod
   def from_logging_level(cls, level: int) -> Level:
@@ -82,9 +75,5 @@ def echo(text: t.Any, fg: str | None = None, *, _with_style: bool = True, json:
 COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
-CONTEXT_SETTINGS: DictStrAny = {
-  'help_option_names': ['-h', '--help'],
-  'max_content_width': COLUMNS,
-  'token_normalize_func': inflection.underscore,
-}
-__all__ = ['COLUMNS', 'CONTEXT_SETTINGS', 'Level', 'critical', 'debug', 'echo', 'error', 'info', 'log', 'warning']
+CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
+__all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS', 'log', 'warning', 'error', 'critical', 'debug', 'info', 'Level']
diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py
index fafa4098..90069b79 100644
--- a/openllm-python/tests/configuration_test.py
+++ b/openllm-python/tests/configuration_test.py
@@ -66,15 +66,8 @@ def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings):
   st.integers(max_value=283473),
   st.floats(min_value=0.0, max_value=1.0),
 )
-def test_complex_struct_dump(
-  gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float
-):
-  cl_ = make_llm_config(
-    'ComplexLLM',
-    gen_settings,
-    fields=(('field1', 'float', field1),),
-    generation_fields=(('temperature', temperature),),
-  )
+def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float):
+  cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),))
   sent = cl_()
   assert sent.model_dump()['field1'] == field1
   assert sent.model_dump()['generation_config']['temperature'] == temperature
diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py
index e49b2656..1efd9e4d 100644
--- a/openllm-python/tests/conftest.py
+++ b/openllm-python/tests/conftest.py
@@ -10,14 +10,8 @@ import openllm
 if t.TYPE_CHECKING:
   from openllm_core._typing_compat import LiteralBackend
-_MODELING_MAPPING = {
-  'flan_t5': 'google/flan-t5-small',
-  'opt': 'facebook/opt-125m',
-  'baichuan': 'baichuan-inc/Baichuan-7B',
-}
-_PROMPT_MAPPING = {
-  'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?'
-}
+_MODELING_MAPPING = {'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B'}
+_PROMPT_MAPPING = {'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?'}
 def parametrise_local_llm(model: str) -> t.Generator[tuple[str, openllm.LLM[t.Any, t.Any]], None, None]:
@@ -31,9 +25,7 @@
 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
   if os.getenv('GITHUB_ACTIONS') is None:
     if 'prompt' in metafunc.fixturenames and 'llm' in metafunc.fixturenames:
-      metafunc.parametrize(
-        'prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])]
-      )
+      metafunc.parametrize('prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])
 def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
diff --git a/openllm-python/tests/strategies_test.py b/openllm-python/tests/strategies_test.py
index 6b95ac0d..f801ed81 100644
--- a/openllm-python/tests/strategies_test.py
+++ b/openllm-python/tests/strategies_test.py
@@ -73,13 +73,9 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
     mcls.setenv('CUDA_VISIBLE_DEVICES', '')
     assert len(NvidiaGpuResource.from_system()) >= 0
   # TODO: real from_system tests
-  assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1]).match(
-    'Input list should be all string type.'
-  )
+  assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1]).match('Input list should be all string type.')
   assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match('Input list should be all string type.')
-  assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match(
-    'Failed to parse available GPUs UUID'
-  )
+  assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')
 def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):