diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2ff88fcd..2dd6f54e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -91,36 +91,36 @@ repos:
       - id: check-added-large-files
       - id: debug-statements
       - id: check-merge-conflict
-  - repo: https://github.com/RobertCraigie/pyright-python
-    rev: v1.1.324
-    hooks:
-    - id: pyright
-      verbose: true
-      args: [--level, error]
-      exclude: |
-        (?x)^(
-            examples/.*|
-            tools/.*|
-            tests/.*|
-            openllm-python/src/openllm/playground/.*|
-            openllm-python/tests/.*|
-            openllm-client/src/openllm_client/pb.*|
-            .github/.*|
-            cz.py |
-            hatch_build.py
-        )$
-      additional_dependencies:
-        - openllm-client[grpc]
-        - bentoml[io]>=1.1.2
-        - transformers[agents,torch,tokenizers,accelerate]>=4.29.0
-        - peft
-        - safetensors
-        - optimum
-        - ghapi
-        - click==8.1.3
-        - bitsandbytes
-        - diffusers
-        - soundfile
+  # - repo: https://github.com/RobertCraigie/pyright-python
+  #   rev: v1.1.324
+  #   hooks:
+  #   - id: pyright
+  #     verbose: true
+  #     args: [--level, error]
+  #     exclude: |
+  #       (?x)^(
+  #           examples/.*|
+  #           tools/.*|
+  #           tests/.*|
+  #           openllm-python/src/openllm/playground/.*|
+  #           openllm-python/tests/.*|
+  #           openllm-client/src/openllm_client/pb.*|
+  #           .github/.*|
+  #           cz.py |
+  #           hatch_build.py
+  #       )$
+  #     additional_dependencies:
+  #       - openllm-client[grpc]
+  #       - bentoml[io]>=1.1.2
+  #       - transformers[agents,torch,tokenizers,accelerate]>=4.29.0
+  #       - peft
+  #       - safetensors
+  #       - optimum
+  #       - ghapi
+  #       - click==8.1.3
+  #       - bitsandbytes
+  #       - diffusers
+  #       - soundfile
   - repo: meta
     hooks:
       - id: check-hooks-apply
diff --git a/cz.py b/cz.py
index f84c3613..53bed816 100755
--- a/cz.py
+++ b/cz.py
@@ -7,6 +7,7 @@ import tokenize
 
 from tabulate import tabulate
 TOKEN_WHITELIST = [token.OP, token.NAME, token.NUMBER, token.STRING]
+
 def run_cz(dir: str, package: str):
   headers = ['Name', 'Lines', 'Tokens/Line']
   table = []
@@ -22,9 +23,11 @@ def run_cz(dir: str, package: str):
   for dir_name, group in itertools.groupby(sorted([(x[0].rsplit('/', 1)[0], x[1]) for x in table]), key=lambda x: x[0]):
     print(f'{dir_name:35s} : {sum([x[1] for x in group]):6d}')
   print(f'\ntotal line count: {sum([x[1] for x in table])}')
+
 def main() -> int:
   run_cz('openllm-python', 'openllm')
   run_cz('openllm-core', 'openllm_core')
   run_cz('openllm-client', 'openllm_client')
   return 0
+
 if __name__ == '__main__': raise SystemExit(main())
diff --git a/examples/bentoml-demo/service.py b/examples/bentoml-demo/service.py
index ac78fb80..9118939a 100644
--- a/examples/bentoml-demo/service.py
+++ b/examples/bentoml-demo/service.py
@@ -8,9 +8,11 @@ llm_config = openllm.AutoConfig.for_model(model)
 llm_runner = openllm.Runner(model, llm_config=llm_config)
 
 svc = bentoml.Service(name="llm-service", runners=[llm_runner])
+
 @svc.on_startup
 def download(_: bentoml.Context):
   llm_runner.download_model()
+
 @svc.api(input=bentoml.io.Text(), output=bentoml.io.Text())
 async def prompt(input_text: str) -> str:
   answer = await llm_runner.generate.async_run(input_text)
diff --git a/examples/langchain-chains-demo/service.py b/examples/langchain-chains-demo/service.py
index 0c1a2dcf..3860cb01 100644
--- a/examples/langchain-chains-demo/service.py
+++ b/examples/langchain-chains-demo/service.py
@@ -8,15 +8,18 @@ from pydantic import BaseModel
 
 import bentoml
 from bentoml.io import JSON, Text
+
 class Query(BaseModel):
   industry: str
   product_name: str
   keywords: t.List[str]
   llm_config: t.Dict[str, t.Any]
+
 def gen_llm(model_name: str, model_id: str | None = None) -> OpenLLM:
   lc_llm = OpenLLM(model_name=model_name, model_id=model_id, embedded=False)
   lc_llm.runner.download_model()
   return lc_llm
+
 llm = gen_llm("dolly-v2", model_id="databricks/dolly-v2-7b")
 
 prompt = PromptTemplate(
@@ -38,12 +41,15 @@ Facebook Ads copy:
 chain = LLMChain(llm=llm, prompt=prompt)
 
 svc = bentoml.Service("fb-ads-copy", runners=[llm.runner])
+
 @svc.on_startup
 def download(_: bentoml.Context):
   llm.runner.download_model()
+
 SAMPLE_INPUT = Query(
     industry="SAAS", product_name="BentoML", keywords=["open source", "developer tool", "AI application platform", "serverless", "cost-efficient"], llm_config=llm.runner.config.model_dump(),
 )
+
 @svc.api(input=JSON.from_sample(sample=SAMPLE_INPUT), output=Text())
 def generate(query: Query):
   return chain.run({"industry": query.industry, "product_name": query.product_name, "keywords": ", ".join(query.keywords)})
diff --git a/examples/langchain-tools-demo/service.py b/examples/langchain-tools-demo/service.py
index 51685533..da5d261a 100644
--- a/examples/langchain-tools-demo/service.py
+++ b/examples/langchain-tools-demo/service.py
@@ -11,6 +11,7 @@ llm = OpenLLM(model_name="dolly-v2", model_id="databricks/dolly-v2-7b", embedded
 tools = load_tools(["serpapi"], llm=llm)
 agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION)
 svc = bentoml.Service("langchain-openllm", runners=[llm.runner])
+
 @svc.api(input=Text.from_sample(sample=SAMPLE_INPUT), output=Text())
 def chat(input_text: str):
   return agent.run(input_text)
diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py
index 99950906..1709d171 100644
--- a/openllm-client/src/openllm_client/_base.py
+++ b/openllm-client/src/openllm_client/_base.py
@@ -21,6 +21,7 @@ if t.TYPE_CHECKING:
 
   from openllm_core._typing_compat import DictStrAny, LiteralRuntime
 logger = logging.getLogger(__name__)
+
 @attr.define(slots=False, init=False)
 class _ClientAttr:
   _address: str
@@ -145,6 +146,7 @@ class _ClientAttr:
   @functools.cached_property
   def inner(self) -> t.Any:
     raise NotImplementedError("'inner' client is not implemented.")
+
 class _Client(_ClientAttr):
   _host: str
   _port: str
@@ -175,6 +177,7 @@ class _Client(_ClientAttr):
     except Exception as err:
       logger.error('Exception caught while sending instruction to HF agent: %s', err, exc_info=err)
       logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address)
+
 class _AsyncClient(_ClientAttr):
   _host: str
   _port: str
@@ -230,6 +233,7 @@ class _AsyncClient(_ClientAttr):
     else:
       tool_code = get_tool_creation_code(code, self._hf_agent.toolbox, remote=remote)
       return f'{tool_code}\n{code}'
+
 class BaseClient(_Client):
   def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
     raise NotImplementedError
@@ -255,6 +259,7 @@ class BaseClient(_Client):
     if return_response == 'attrs': return r
     elif return_response == 'raw': return bentoml_cattr.unstructure(r)
     else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
+
 class BaseAsyncClient(_AsyncClient):
   async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
     raise NotImplementedError
diff --git a/openllm-client/src/openllm_client/benmin/__init__.py b/openllm-client/src/openllm_client/benmin/__init__.py
index 9ea1f15b..39a233b9 100644
--- a/openllm-client/src/openllm_client/benmin/__init__.py
+++ b/openllm-client/src/openllm_client/benmin/__init__.py
@@ -22,6 +22,7 @@ import bentoml
 if t.TYPE_CHECKING: from bentoml._internal.service.inference_api import InferenceAPI
 
 __all__ = ['Client', 'AsyncClient']
+
 @attr.define(init=False)
 class Client:
   server_url: str
@@ -67,6 +68,7 @@ class Client:
       return GrpcClient.wait_until_server_ready(host, port, timeout, **kwargs)
     except Exception as err:
       raise bentoml.exceptions.BentoMLException('Failed to wait until server ready: %s:%d' % (host, port)) from err
+
 @attr.define(init=False)
 class AsyncClient:
   server_url: str
diff --git a/openllm-client/src/openllm_client/benmin/_grpc.py b/openllm-client/src/openllm_client/benmin/_grpc.py
index f1aca062..22dbbfb1 100644
--- a/openllm-client/src/openllm_client/benmin/_grpc.py
+++ b/openllm-client/src/openllm_client/benmin/_grpc.py
@@ -22,10 +22,12 @@ pb, services = import_generated_stubs('v1')
 if t.TYPE_CHECKING:
   from bentoml.grpc.v1.service_pb2 import ServiceMetadataResponse
 logger = logging.getLogger(__name__)
+
 class ClientCredentials(t.TypedDict):
   root_certificates: NotRequired[t.Union[bytes, str]]
   private_key: NotRequired[t.Union[bytes, str]]
   certificate_chain: NotRequired[t.Union[bytes, str]]
+
 @overload
 def dispatch_channel(
     server_url: str,
@@ -37,6 +39,7 @@ def dispatch_channel(
     interceptors: t.Sequence[aio.ClientInterceptor] | None = ...
 ) -> aio.Channel:
   ...
+
 @overload
 def dispatch_channel(
     server_url: str,
@@ -48,6 +51,7 @@ def dispatch_channel(
     interceptors: t.Sequence[aio.ClientInterceptor] | None = None
 ) -> grpc.Channel:
   ...
+
 def dispatch_channel(
     server_url: str,
     typ: t.Literal['async', 'sync'] = 'sync',
@@ -67,6 +71,7 @@ def dispatch_channel(
   elif typ == 'sync' and ssl: return grpc.secure_channel(server_url, credentials=credentials, options=options, compression=compression)
   elif typ == 'sync': return grpc.insecure_channel(server_url, options=options, compression=compression)
   else: raise ValueError(f'Unknown type: {typ}')
+
 class GrpcClient(Client):
   ssl: bool
   ssl_client_credentials: t.Optional[ClientCredentials]
@@ -172,6 +177,7 @@ class GrpcClient(Client):
     stubs = services.BentoServiceStub(self.inner)
     proto = stubs.Call(pb.Request(**{'api_name': api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs)
     return ensure_exec_coro(_inference_api.output.from_proto(getattr(proto, proto.WhichOneof('content'))))
+
 class AsyncGrpcClient(AsyncClient):
   ssl: bool
   ssl_client_credentials: t.Optional[ClientCredentials]
diff --git a/openllm-client/src/openllm_client/benmin/_http.py b/openllm-client/src/openllm_client/benmin/_http.py
index 11772f54..ea3dff48 100644
--- a/openllm-client/src/openllm_client/benmin/_http.py
+++ b/openllm-client/src/openllm_client/benmin/_http.py
@@ -18,6 +18,7 @@ from bentoml._internal.service.inference_api import InferenceAPI
 from openllm_client.benmin import AsyncClient, Client
 from openllm_core.utils import ensure_exec_coro
 logger = logging.getLogger(__name__)
+
 class HttpClient(Client):
   @functools.cached_property
   def inner(self) -> httpx.Client:
@@ -102,6 +103,7 @@ class HttpClient(Client):
     # Request.headers sets a _headers variable. We will need to set this value to our fake request object.
     fake_req._headers = headers
     return ensure_exec_coro(_inference_api.output.from_http_request(fake_req))
+
 class AsyncHttpClient(AsyncClient):
   @functools.cached_property
   def inner(self) -> httpx.AsyncClient:
diff --git a/openllm-client/src/openllm_client/client.py b/openllm-client/src/openllm_client/client.py
index ceafef7e..4cd784d6 100644
--- a/openllm-client/src/openllm_client/client.py
+++ b/openllm-client/src/openllm_client/client.py
@@ -4,24 +4,29 @@ from urllib.parse import urlparse
 
 from ._base import BaseAsyncClient, BaseClient
 logger = logging.getLogger(__name__)
+
 def process_http_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None:
   address = address if '://' in address else 'http://' + address
   parsed = urlparse(address)
   self._host, *_port = parsed.netloc.split(':')
   if len(_port) == 0: self._port = '80' if parsed.scheme == 'http' else '443'
   else: self._port = next(iter(_port))
+
 class HTTPClient(BaseClient):
   def __init__(self, address: str, timeout: int = 30):
     process_http_address(self, address)
     super().__init__(address, timeout)
+
 class AsyncHTTPClient(BaseAsyncClient):
   def __init__(self, address: str, timeout: int = 30):
     process_http_address(self, address)
     super().__init__(address, timeout)
+
 class GrpcClient(BaseClient):
   def __init__(self, address: str, timeout: int = 30):
     self._host, self._port = address.split(':')
     super().__init__(address, timeout)
+
 class AsyncGrpcClient(BaseAsyncClient):
   def __init__(self, address: str, timeout: int = 30):
     self._host, self._port = address.split(':')
diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py
index 54bcbc95..9ad2464c 100644
--- a/openllm-core/src/openllm_core/_configuration.py
+++ b/openllm-core/src/openllm_core/_configuration.py
@@ -76,11 +76,13 @@ __all__ = ['LLMConfig', 'GenerationConfig', 'SamplingParams', 'field_env_key']
 
 logger = logging.getLogger(__name__)
 config_merger = Merger([(dict, 'merge')], ['override'], ['override'])
+
 # case insensitive, but rename to conform with type
 class _PeftEnumMeta(enum.EnumMeta):
   def __getitem__(self, __key: str | t.Any, /) -> t.Any:
     if isinstance(__key, str): __key = inflection.underscore(__key).upper()
     return self._member_map_[__key]
+
 # vendorred from peft.utils.config.PeftType since we don't have hard dependency on peft
 # see https://github.com/huggingface/peft/blob/main/src/peft/utils/config.py
 class PeftType(str, enum.Enum, metaclass=_PeftEnumMeta):
@@ -109,14 +111,17 @@ class PeftType(str, enum.Enum, metaclass=_PeftEnumMeta):
   @staticmethod
   def get(__key: str | t.Any, /) -> PeftType:
     return PeftType[__key]  # type-safe getitem.
+
 _PEFT_TASK_TYPE_TARGET_MAPPING = {'causal_lm': 'CAUSAL_LM', 'seq2seq_lm': 'SEQ_2_SEQ_LM'}
 
 _object_setattr = object.__setattr__
+
 def _adapter_converter(value: AdapterType | str | PeftType | None) -> PeftType:
   if value is None: raise ValueError("'AdapterType' cannot be None.")
   if isinstance(value, PeftType): return value
   if value not in PeftType.supported(): raise ValueError(f"Given '{value}' is not a supported adapter type.")
   return PeftType.get(value)
+
 @attr.define(slots=True, init=True)
 class FineTuneConfig:
   '''FineTuneConfig defines a default value for fine-tuning this any given LLM.
@@ -193,6 +198,7 @@ class FineTuneConfig:
     adapter_type, inference_mode = attrs.pop('adapter_type', self.adapter_type), attrs.get('inference_mode', self.inference_mode)
     if 'llm_config_class' in attrs: raise ForbiddenAttributeError("'llm_config_class' should not be passed when using 'with_config'.")
     return attr.evolve(self, adapter_type=adapter_type, inference_mode=inference_mode, adapter_config=config_merger.merge(self.adapter_config, attrs))
+
 @attr.frozen(slots=True, repr=False, init=False)
 class GenerationConfig(ReprMixin):
   '''GenerationConfig is the attrs-compatible version of ``transformers.GenerationConfig``, with some additional validation and environment constructor.
@@ -317,6 +323,7 @@ class GenerationConfig(ReprMixin):
   @property
   def __repr_keys__(self) -> set[str]:
     return {i.name for i in attr.fields(self.__class__)}
+
 bentoml_cattr.register_unstructure_hook_factory(
     lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig),
     lambda cls: make_dict_unstructure_fn(
@@ -329,6 +336,7 @@ bentoml_cattr.register_unstructure_hook_factory(
         }
     )
 )
+
 @attr.frozen(slots=True, repr=False, init=False)
 class SamplingParams(ReprMixin):
   '''SamplingParams is the attr-compatible version of ``vllm.SamplingParams``. It provides some utilities to also respect shared variables from ``openllm.LLMConfig``.
@@ -398,6 +406,7 @@ class SamplingParams(ReprMixin):
     top_p = first_not_none(attrs.pop('top_p', None), default=generation_config['top_p'])
     max_tokens = first_not_none(attrs.pop('max_tokens', None), attrs.pop('max_new_tokens', None), default=generation_config['max_new_tokens'])
     return cls(_internal=True, temperature=temperature, top_k=top_k, top_p=top_p, max_tokens=max_tokens, **attrs)
+
 bentoml_cattr.register_unstructure_hook_factory(
     lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams),
     lambda cls: make_dict_unstructure_fn(
@@ -417,6 +426,7 @@ bentoml_cattr.register_structure_hook_factory(
 
 # cached it here to save one lookup per assignment
 _object_getattribute = object.__getattribute__
+
 class ModelSettings(t.TypedDict, total=False):
   '''ModelSettings serve only for typing purposes as this is transcribed into LLMConfig.__config__.
 
@@ -461,7 +471,9 @@ class ModelSettings(t.TypedDict, total=False):
 
   # tokenizer_class is the custom tokenizer class for this given LLM
   tokenizer_class: t.Optional[str]
+
 _transformed_type: DictStrAny = {'fine_tune_strategies': t.Dict[AdapterType, FineTuneConfig], 'default_implementation': t.Dict[LiteralResourceSpec, LiteralRuntime]}
+
 @attr.define(
     frozen=False,
     slots=True,
@@ -539,6 +551,7 @@ class _ModelSettingsAttr:
     fine_tune_strategies: t.Dict[AdapterType, FineTuneConfig]
     tokenizer_class: t.Optional[str]
     # update-config-stubs.py: attrs stop
+
 # a heuristic cascading implementation resolver based on available resources
 def get_default_implementation(default_implementation_mapping: dict[LiteralResourceSpec, LiteralRuntime]) -> LiteralRuntime:
   available_spec = available_resource_spec()
@@ -546,6 +559,7 @@ def get_default_implementation(default_implementation_mapping: dict[LiteralResou
   elif resource_spec('amd') in available_spec: return default_implementation_mapping.get(resource_spec('amd'), 'pt')
   elif resource_spec('nvidia') in available_spec: return default_implementation_mapping.get(resource_spec('nvidia'), 'pt')
   else: return default_implementation_mapping.get(resource_spec('cpu'), 'pt')
+
 def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr:
   if 'generation_class' in cl_.__config__:
     raise ValueError(f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.")
@@ -591,9 +605,12 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
       _converted[_adapter_type] = FineTuneConfig(PeftType[_adapter_type], _possible_ft_config, False, _llm_config_class)
   _final_value_dct['fine_tune_strategies'] = _converted
   return attr.evolve(_settings_attr, **_final_value_dct)
+
 bentoml_cattr.register_structure_hook(_ModelSettingsAttr, structure_settings)
+
 def _setattr_class(attr_name: str, value_var: t.Any) -> str:
   return f"setattr(cls, '{attr_name}', {value_var})"
+
 def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance, _prefix: LiteralString = 'openllm') -> t.Callable[..., None]:
   '''Generate the assignment script with prefix attributes __openllm_<value>__.'''
   args: ListStr = []
@@ -608,7 +625,9 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance
     annotations[attr_name] = field.type
 
   return codegen.generate_function(cls, '__assign_attr', lines, args=('cls', *args), globs=globs, annotations=annotations)
+
 _reserved_namespace = {'__config__', 'GenerationConfig', 'SamplingParams'}
+
 @attr.define(slots=True)
 class _ConfigAttr:
   @staticmethod
@@ -760,6 +779,7 @@ class _ConfigAttr:
     '''The fine-tune strategies for this given LLM.'''
     __openllm_tokenizer_class__: t.Optional[str] = Field(None)
     '''Optional tokenizer class for this given LLM. See Llama for example.'''
+
     # update-config-stubs.py: special stop
 class _ConfigBuilder:
   """A modified version of attrs internal _ClassBuilder, and should only be called within __init_subclass__ of LLMConfig.
@@ -873,6 +893,7 @@ class _ConfigBuilder:
       if key in ('__repr__', '__str__', '__repr_name__', '__repr_str__', '__repr_args__'): self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn)
     self._cls_dict['__repr_keys__'] = property(lambda _: {i.name for i in self._attrs} | {'generation_config', 'sampling_config'})
     return self
+
 @attr.define(slots=True, init=False)
 class LLMConfig(_ConfigAttr):
   """``openllm.LLMConfig`` is a pydantic-like ``attrs`` interface that offers fast and easy-to-use APIs.
@@ -1474,9 +1495,11 @@ class LLMConfig(_ConfigAttr):
     `openllm.LLM` also has a postprocess_generate that will just call this method.
     '''
     return generation_result
+
 bentoml_cattr.register_unstructure_hook_factory(
     lambda cls: lenient_issubclass(cls, LLMConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True)
 )
+
 def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
   """Structure a dictionary to a LLMConfig object.
 
@@ -1498,5 +1521,6 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
   # The rest should be passed to extras
   data = {k: v for k, v in data.items() if k not in cls.__openllm_accepted_keys__}
   return cls(generation_config=generation_config, __openllm_extras__=data, **cls_attrs)
+
 bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config)
 openllm_home = os.path.expanduser(os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm')))
diff --git a/openllm-core/src/openllm_core/_prompt.py b/openllm-core/src/openllm_core/_prompt.py
index 14a193d2..89fe5987 100644
--- a/openllm-core/src/openllm_core/_prompt.py
+++ b/openllm-core/src/openllm_core/_prompt.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import string
 import typing as t
+
 class PromptFormatter(string.Formatter):
   """This PromptFormatter is largely based on langchain's implementation."""
   def vformat(self, format_string: str, args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> t.Any:
@@ -13,7 +14,9 @@ class PromptFormatter(string.Formatter):
 
   def extract_template_variables(self, template: str) -> t.Sequence[str]:
     return [field[1] for field in self.parse(template) if field[1] is not None]
+
 default_formatter = PromptFormatter()
+
 def process_prompt(prompt: str, template: str | None = None, use_prompt_template: bool = True, **attrs: t.Any) -> str:
   # Currently, all default prompt will always have `instruction` key.
   if not use_prompt_template: return prompt
diff --git a/openllm-core/src/openllm_core/_schema.py b/openllm-core/src/openllm_core/_schema.py
index 2a2c82fc..7ce934a0 100644
--- a/openllm-core/src/openllm_core/_schema.py
+++ b/openllm-core/src/openllm_core/_schema.py
@@ -10,6 +10,7 @@ from openllm_core._configuration import GenerationConfig, LLMConfig
 
 from .utils import bentoml_cattr
 if t.TYPE_CHECKING: import vllm
+
 @attr.frozen(slots=True)
 class GenerationInput:
   prompt: str
@@ -41,6 +42,7 @@ class GenerationInput:
             'adapter_name': attr.field(default=None, type=str)
         }
     )
+
 @attr.frozen(slots=True)
 class GenerationOutput:
   responses: t.List[t.Any]
@@ -58,6 +60,7 @@ class GenerationOutput:
     if hasattr(self, key): return getattr(self, key)
     elif key in self.configuration: return self.configuration[key]
     else: raise KeyError(key)
+
 @attr.frozen(slots=True)
 class MetadataOutput:
   model_id: str
@@ -67,10 +70,12 @@ class MetadataOutput:
   configuration: str
   supports_embeddings: bool
   supports_hf_agent: bool
+
 @attr.frozen(slots=True)
 class EmbeddingsOutput:
   embeddings: t.List[t.List[float]]
   num_tokens: int
+
 def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.Any]:
   return dict(
       request_id=request_output.request_id,
@@ -82,6 +87,7 @@ def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.An
           for it in request_output.outputs
       ]
   )
+
 @attr.define
 class HfAgentInput:
   inputs: str
diff --git a/openllm-core/src/openllm_core/_strategies.py b/openllm-core/src/openllm_core/_strategies.py
index 9f54d42a..461a6eb5 100644
--- a/openllm-core/src/openllm_core/_strategies.py
+++ b/openllm-core/src/openllm_core/_strategies.py
@@ -18,13 +18,16 @@ from bentoml._internal.runner.strategy import THREAD_ENVS
 
 from ._typing_compat import overload
 from .utils import DEBUG, ReprMixin
+
 class DynResource(t.Protocol):
   resource_id: t.ClassVar[str]
 
   @classmethod
   def from_system(cls) -> t.Sequence[t.Any]:
     ...
+
 logger = logging.getLogger(__name__)
+
 def _strtoul(s: str) -> int:
   '''Return -1 or positive integer sequence string starts with,.'''
   if not s: return -1
@@ -34,6 +37,7 @@ def _strtoul(s: str) -> int:
     if idx + 1 == len(s): idx += 1  # noqa: PLW2901
   # NOTE: idx will be set via enumerate
   return int(s[:idx]) if idx > 0 else -1
+
 def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
   rcs: list[str] = []
   for elem in lst.split(','):
@@ -43,16 +47,21 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
     if not elem.startswith(prefix): break
     rcs.append(elem)
   return rcs
+
 _STACK_LEVEL = 3
+
 @overload  # variant: default callback
 def _parse_visible_devices() -> list[str] | None:
   ...
+
 @overload  # variant: specify None, and respect_env
 def _parse_visible_devices(default_var: None, *, respect_env: t.Literal[True]) -> list[str] | None:
   ...
+
 @overload  # variant: default var is something other than None
 def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]:
   ...
+
 def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
   '''CUDA_VISIBLE_DEVICES aware with default var for parsing spec.'''
   if respect_env:
@@ -76,6 +85,7 @@ def _parse_visible_devices(default_var: str | None = None, respect_env: bool = T
     if x < 0: break
     rc.append(x)
   return [str(i) for i in rc]
+
 def _from_system(cls: type[DynResource]) -> list[str]:
   visible_devices = _parse_visible_devices()
   if visible_devices is None:
@@ -111,15 +121,19 @@ def _from_system(cls: type[DynResource]) -> list[str]:
       except (ImportError, RuntimeError, AttributeError):
         return []
   return visible_devices
+
 @overload
 def _from_spec(cls: type[DynResource], spec: int) -> list[str]:
   ...
+
 @overload
 def _from_spec(cls: type[DynResource], spec: list[int | str]) -> list[str]:
   ...
+
 @overload
 def _from_spec(cls: type[DynResource], spec: str) -> list[str]:
   ...
+
 def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
   if isinstance(spec, int):
     if spec in (-1, 0): return []
@@ -133,6 +147,7 @@ def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
     return [str(x) for x in spec]
   else:
     raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")
+
 def _raw_device_uuid_nvml() -> list[str] | None:
   from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer
 
@@ -167,6 +182,7 @@ def _raw_device_uuid_nvml() -> list[str] | None:
     uuids.append(buf.raw.decode('ascii').strip('\0'))
   del nvml_h
   return uuids
+
 def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
   if cls.resource_id == 'amd.com/gpu':
     raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'")
@@ -189,6 +205,7 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
         if err != cuda.CUresult.CUDA_SUCCESS: raise ValueError(f'Failed to get device {el}')
   except (ImportError, RuntimeError):
     pass
+
 def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
   return types.new_class(
       name, (bentoml.Resource[t.List[str]], ReprMixin), {'resource_id': resource_kind},
@@ -202,6 +219,7 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
           '__module__': 'openllm._strategies'
       })
   )
+
 # NOTE: we need to hint these t.Literal since mypy is to dumb to infer this as literal :facepalm:
 _TPU_RESOURCE: t.Literal['cloud-tpus.google.com/v2'] = 'cloud-tpus.google.com/v2'
 _AMD_GPU_RESOURCE: t.Literal['amd.com/gpu'] = 'amd.com/gpu'
@@ -226,6 +244,7 @@ AmdGpuResource = _make_resource_class(
 )
 
 LiteralResourceSpec = t.Literal['cloud-tpus.google.com/v2', 'amd.com/gpu', 'nvidia.com/gpu', 'cpu']
+
 # convenient mapping
 def resource_spec(name: t.Literal['tpu', 'amd', 'nvidia', 'cpu']) -> LiteralResourceSpec:
   if name == 'tpu': return _TPU_RESOURCE
@@ -233,6 +252,7 @@ def resource_spec(name: t.Literal['tpu', 'amd', 'nvidia', 'cpu']) -> LiteralReso
   elif name == 'nvidia': return _NVIDIA_GPU_RESOURCE
   elif name == 'cpu': return _CPU_RESOURCE
   else: raise ValueError("Unknown alias. Accepted: ['tpu', 'amd', 'nvidia', 'cpu']")
+
 @functools.lru_cache
 def available_resource_spec() -> tuple[LiteralResourceSpec, ...]:
   '''This is a utility function helps to determine the available resources from given running system.
@@ -246,6 +266,7 @@ def available_resource_spec() -> tuple[LiteralResourceSpec, ...]:
   if len(NvidiaGpuResource.from_system()) > 0: available.append(_NVIDIA_GPU_RESOURCE)
   available.append(_CPU_RESOURCE)
   return tuple(available)
+
 class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
   """This is extends the default BentoML strategy where we check for NVIDIA GPU resource -> AMD GPU resource -> CPU resource.
 
@@ -356,4 +377,5 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
       if idx >= len(gpus): raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}')
       dev = str(gpus[idx])
     return dev
+
 __all__ = ['CascadingResourceStrategy', 'get_resource']
diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py
index 0a7b1d38..8572547b 100644
--- a/openllm-core/src/openllm_core/_typing_compat.py
+++ b/openllm-core/src/openllm_core/_typing_compat.py
@@ -49,22 +49,28 @@ if sys.version_info[:2] >= (3, 10):
   from typing import Concatenate as Concatenate, ParamSpec as ParamSpec, TypeAlias as TypeAlias
 else:
   from typing_extensions import Concatenate as Concatenate, ParamSpec as ParamSpec, TypeAlias as TypeAlias
+
 class PeftAdapterOutput(t.TypedDict):
   success: bool
   result: t.Dict[str, peft.PeftConfig]
   error_msg: str
+
 class LLMEmbeddings(t.TypedDict):
   embeddings: t.List[t.List[float]]
   num_tokens: int
+
 class AdaptersTuple(TupleAny):
   adapter_id: str
   name: t.Optional[str]
   config: DictStrAny
+
 AdaptersMapping = t.Dict[AdapterType, t.Tuple[AdaptersTuple, ...]]
+
 class RefTuple(TupleAny):
   git_hash: str
   version: VersionInfo
   strategy: LiteralContainerVersionStrategy
+
 class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
   SUPPORTED_RESOURCES = ('amd.com/gpu', 'nvidia.com/gpu', 'cpu')
   SUPPORTS_CPU_MULTI_THREADING = True
@@ -74,6 +80,7 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
   generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
   generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
   generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
+
 class LLMRunner(bentoml.Runner, t.Generic[M, T]):
   __doc__: str
   __module__: str
diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py
index b7e8a622..65839e26 100644
--- a/openllm-core/src/openllm_core/config/configuration_auto.py
+++ b/openllm-core/src/openllm_core/config/configuration_auto.py
@@ -21,6 +21,7 @@ if t.TYPE_CHECKING:
 CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'), ('falcon', 'FalconConfig'), ('flan_t5', 'FlanT5Config'), ('gpt_neox', 'GPTNeoXConfig'), (
     'llama', 'LlamaConfig'
 ), ('mpt', 'MPTConfig'), ('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'), ('starcoder', 'StarCoderConfig'), ('baichuan', 'BaichuanConfig')])
+
 class _LazyConfigMapping(OrderedDict, ReprMixin):
   def __init__(self, mapping: OrderedDict[LiteralString, LiteralString]):
     self._mapping = mapping
@@ -66,9 +67,11 @@ class _LazyConfigMapping(OrderedDict, ReprMixin):
   def register(self, key: str, value: t.Any) -> None:
     if key in self._mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM config, pick another name.")
     self._extra_content[key] = value
+
 CONFIG_MAPPING: dict[str, type[openllm_core.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
 # The below handle special alias when we call underscore to the name directly without processing camelcase first.
 CONFIG_NAME_ALIASES: dict[str, str] = {'chat_glm': 'chatglm', 'stable_lm': 'stablelm', 'star_coder': 'starcoder', 'gpt_neo_x': 'gpt_neox',}
+
 class AutoConfig:
   def __init__(self, *_: t.Any, **__: t.Any):
     raise EnvironmentError('Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.')
diff --git a/openllm-core/src/openllm_core/config/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py
index 5556d237..9bf03f56 100644
--- a/openllm-core/src/openllm_core/config/configuration_baichuan.py
+++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py
@@ -23,6 +23,7 @@ or provide `--model-id` flag when running ``openllm start baichuan``:
 $ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b'
 '''
 DEFAULT_PROMPT_TEMPLATE = '''{instruction}'''
+
 class BaichuanConfig(openllm_core.LLMConfig):
   """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.
 
diff --git a/openllm-core/src/openllm_core/config/configuration_chatglm.py b/openllm-core/src/openllm_core/config/configuration_chatglm.py
index 8cd7cb05..777609ed 100644
--- a/openllm-core/src/openllm_core/config/configuration_chatglm.py
+++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py
@@ -23,6 +23,7 @@ or provide `--model-id` flag when running ``openllm start chatglm``:
 $ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
 '''
 DEFAULT_PROMPT_TEMPLATE = '''{instruction}'''
+
 class ChatGLMConfig(openllm_core.LLMConfig):
   """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.
 
diff --git a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
index 568cac9e..f5adf687 100644
--- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
+++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
@@ -37,6 +37,7 @@ DEFAULT_PROMPT_TEMPLATE = '''{intro}
 {instruction}
 {response_key}
 '''.format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction='{instruction}', response_key=RESPONSE_KEY)
+
 def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str) -> int:
   '''Gets the token ID for a given string that has been added to the tokenizer as a special token.
 
@@ -56,6 +57,7 @@ def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str)
   token_ids = tokenizer.encode(key)
   if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
   return token_ids[0]
+
 class DollyV2Config(openllm_core.LLMConfig):
   """Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.
 
diff --git a/openllm-core/src/openllm_core/config/configuration_falcon.py b/openllm-core/src/openllm_core/config/configuration_falcon.py
index 95be5452..49b94ee7 100644
--- a/openllm-core/src/openllm_core/config/configuration_falcon.py
+++ b/openllm-core/src/openllm_core/config/configuration_falcon.py
@@ -28,6 +28,7 @@ DEFAULT_PROMPT_TEMPLATE = '''{context}
 {user_name}: {instruction}
 {agent}:
 '''
+
 class FalconConfig(openllm_core.LLMConfig):
   """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.
 
diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py
index 191dbdfd..5cda6659 100644
--- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py
+++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py
@@ -29,6 +29,7 @@ or provide `--model-id` flag when running ``openllm start flan-t5``:
 $ openllm start flan-t5 --model-id google/flan-t5-xxl
 '''
 DEFAULT_PROMPT_TEMPLATE = '''Answer the following question:\nQuestion: {instruction}\nAnswer:'''
+
 class FlanT5Config(openllm_core.LLMConfig):
   """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
 
diff --git a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py
index 9438c608..9960c309 100644
--- a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py
+++ b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py
@@ -24,6 +24,7 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``:
 $ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b'
 '''
 DEFAULT_PROMPT_TEMPLATE = '''{instruction}'''
+
 class GPTNeoXConfig(openllm_core.LLMConfig):
   """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license.
 
diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py
index 20e0207d..d590df28 100644
--- a/openllm-core/src/openllm_core/config/configuration_llama.py
+++ b/openllm-core/src/openllm_core/config/configuration_llama.py
@@ -42,9 +42,12 @@ SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = '[INST]', '[/INST]', '<<SY
 # TODO: support history and v1 prompt implementation
 _v1_prompt, _v2_prompt = '''{instruction}''', '''{start_key} {sys_key}\n{system_message}\n{sys_key}\n\n{instruction}\n{end_key} '''.format(start_key=SINST_KEY, sys_key=SYS_KEY, system_message=SYSTEM_MESSAGE, instruction='{instruction}', end_key=EINST_KEY)
 PROMPT_MAPPING = {'v1': _v1_prompt, 'v2': _v2_prompt}
+
 def _get_prompt(model_type: t.Literal['v1', 'v2']) -> str:
   return PROMPT_MAPPING[model_type]
+
 DEFAULT_PROMPT_TEMPLATE = _get_prompt
+
 class LlamaConfig(openllm_core.LLMConfig):
   """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
 
diff --git a/openllm-core/src/openllm_core/config/configuration_mpt.py b/openllm-core/src/openllm_core/config/configuration_mpt.py
index f62451e9..f12fbd67 100644
--- a/openllm-core/src/openllm_core/config/configuration_mpt.py
+++ b/openllm-core/src/openllm_core/config/configuration_mpt.py
@@ -43,9 +43,12 @@ _chat_prompt, _default_prompt, _instruct_prompt = '''{instruction}''', '''{instr
 {response_key}
 '''.format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction='{instruction}', response_key=RESPONSE_KEY)
 PROMPT_MAPPING = {'default': _default_prompt, 'instruct': _instruct_prompt, 'storywriter': _default_prompt, 'chat': _chat_prompt}
+
 def _get_prompt(model_type: str) -> str:
   return PROMPT_MAPPING[model_type]
+
 DEFAULT_PROMPT_TEMPLATE = _get_prompt
+
 class MPTConfig(openllm_core.LLMConfig):
   """MPT is a decoder-style transformer pretrained from scratch on English text and code.
 
diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py
index e3da9838..b0cfdd65 100644
--- a/openllm-core/src/openllm_core/config/configuration_opt.py
+++ b/openllm-core/src/openllm_core/config/configuration_opt.py
@@ -30,6 +30,7 @@ or provide `--model-id` flag when running ``openllm start opt``:
 $ openllm start opt --model-id facebook/opt-6.7b
 '''
 DEFAULT_PROMPT_TEMPLATE = '''{instruction}'''
+
 class OPTConfig(openllm_core.LLMConfig):
   """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.
 
diff --git a/openllm-core/src/openllm_core/config/configuration_stablelm.py b/openllm-core/src/openllm_core/config/configuration_stablelm.py
index 001d99ab..76e456f6 100644
--- a/openllm-core/src/openllm_core/config/configuration_stablelm.py
+++ b/openllm-core/src/openllm_core/config/configuration_stablelm.py
@@ -29,6 +29,7 @@ SYSTEM_PROMPT = '''<|SYSTEM|># StableLM Tuned (Alpha version)
 - StableLM will refuse to participate in anything that could harm a human.
 '''
 DEFAULT_PROMPT_TEMPLATE = '''{system_prompt}<|USER|>{instruction}<|ASSISTANT|>'''
+
 class StableLMConfig(openllm_core.LLMConfig):
   """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.
 
diff --git a/openllm-core/src/openllm_core/config/configuration_starcoder.py b/openllm-core/src/openllm_core/config/configuration_starcoder.py
index 50a60625..b345b084 100644
--- a/openllm-core/src/openllm_core/config/configuration_starcoder.py
+++ b/openllm-core/src/openllm_core/config/configuration_starcoder.py
@@ -23,6 +23,7 @@ $ openllm start starcoder --model-id 'bigcode/starcoder'
 '''
 DEFAULT_PROMPT_TEMPLATE = '''{instruction}'''
 FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = '<fim-prefix>', '<fim-middle>', '<fim-suffix>', '<fim-pad>', '<|endoftext|>', '<FILL_HERE>'
+
 class StarCoderConfig(openllm_core.LLMConfig):
   """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.
 
diff --git a/openllm-core/src/openllm_core/exceptions.py b/openllm-core/src/openllm_core/exceptions.py
index c9ff18c0..60adff51 100644
--- a/openllm-core/src/openllm_core/exceptions.py
+++ b/openllm-core/src/openllm_core/exceptions.py
@@ -2,19 +2,27 @@
 from __future__ import annotations
 
 import bentoml
+
 class OpenLLMException(bentoml.exceptions.BentoMLException):
   '''Base class for all OpenLLM exceptions. This extends BentoMLException.'''
+
 class GpuNotAvailableError(OpenLLMException):
   '''Raised when there is no GPU available in given system.'''
+
 class ValidationError(OpenLLMException):
   '''Raised when a validation fails.'''
+
 class ForbiddenAttributeError(OpenLLMException):
   '''Raised when using an _internal field.'''
+
 class MissingAnnotationAttributeError(OpenLLMException):
   '''Raised when a field under openllm.LLMConfig is missing annotations.'''
+
 class MissingDependencyError(BaseException):
   '''Raised when a dependency is missing.'''
+
 class Error(BaseException):
   '''To be used instead of naked raise.'''
+
 class FineTuneStrategyNotSupportedError(OpenLLMException):
   '''Raised when a fine-tune strategy is not supported for given LLM.'''
diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py
index 4d435425..decaed5a 100644
--- a/openllm-core/src/openllm_core/utils/__init__.py
+++ b/openllm-core/src/openllm_core/utils/__init__.py
@@ -20,28 +20,25 @@ from circus.exc import ConflictError
 
 import openllm_core
 from bentoml._internal.configuration import (
-  DEBUG_ENV_VAR as DEBUG_ENV_VAR,
-  GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR,
-  QUIET_ENV_VAR as QUIET_ENV_VAR,
-  get_debug_mode as _get_debug_mode,
-  get_quiet_mode as _get_quiet_mode,
-  set_quiet_mode as set_quiet_mode,
+    DEBUG_ENV_VAR as DEBUG_ENV_VAR,
+    GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR,
+    QUIET_ENV_VAR as QUIET_ENV_VAR,
+    get_debug_mode as _get_debug_mode,
+    get_quiet_mode as _get_quiet_mode,
+    set_quiet_mode as set_quiet_mode,
 )
 from bentoml._internal.models.model import ModelContext as _ModelContext
 from bentoml._internal.types import LazyType as LazyType
 from bentoml._internal.utils import (
-  LazyLoader as LazyLoader,
-  bentoml_cattr as bentoml_cattr,
-  calc_dir_size as calc_dir_size,
-  first_not_none as first_not_none,
-  pkg as pkg,
-  reserve_free_port as reserve_free_port,
-  resolve_user_filepath as resolve_user_filepath,
-)
-from openllm_core.utils.lazy import (
-  LazyModule as LazyModule,
-  VersionInfo as VersionInfo,
+    LazyLoader as LazyLoader,
+    bentoml_cattr as bentoml_cattr,
+    calc_dir_size as calc_dir_size,
+    first_not_none as first_not_none,
+    pkg as pkg,
+    reserve_free_port as reserve_free_port,
+    resolve_user_filepath as resolve_user_filepath,
 )
+from openllm_core.utils.lazy import (LazyModule as LazyModule, VersionInfo as VersionInfo,)
 if t.TYPE_CHECKING:
   from openllm_core._typing_compat import AnyCallable
 logger = logging.getLogger(__name__)
@@ -53,25 +50,30 @@ if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,)
 else: _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType)  # type: ignore #  _GenericAlias is the actual GenericAlias implementation
 
 DEV_DEBUG_VAR = 'OPENLLMDEVDEBUG'
+
 def set_debug_mode(enabled: bool, level: int = 1) -> None:
   # monkeypatch bentoml._internal.configuration.set_debug_mode to remove unused logs
   if enabled: os.environ[DEV_DEBUG_VAR] = str(level)
   os.environ[DEBUG_ENV_VAR] = str(enabled)
   os.environ[_GRPC_DEBUG_ENV_VAR] = 'DEBUG' if enabled else 'ERROR'
+
 def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.Any], ...] | None) -> bool:
   try:
     return isinstance(cls, type) and issubclass(cls, class_or_tuple)  # type: ignore[arg-type]
   except TypeError:
     if isinstance(cls, _WithArgsTypes): return False
     raise
+
 def ensure_exec_coro(coro: t.Coroutine[t.Any, t.Any, t.Any]) -> t.Any:
   loop = asyncio.get_event_loop()
   if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result()
   else: return loop.run_until_complete(coro)
+
 def available_devices() -> tuple[str, ...]:
   '''Return available GPU under system. Currently only supports NVIDIA GPUs.'''
   from openllm_core._strategies import NvidiaGpuResource
   return tuple(NvidiaGpuResource.from_system())
+
 @functools.lru_cache(maxsize=128)
 def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1') -> str:
   """Generate a hash from given file's modification time.
@@ -84,26 +86,34 @@ def generate_hash_from_file(f: str, algorithm: t.Literal['md5', 'sha1'] = 'sha1'
   The generated hash.
   """
   return getattr(hashlib, algorithm)(str(os.path.getmtime(resolve_filepath(f))).encode()).hexdigest()
+
 @functools.lru_cache(maxsize=1)
 def device_count() -> int:
   return len(available_devices())
+
 # equivocal setattr to save one lookup per assignment
 _object_setattr = object.__setattr__
+
 def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
   """This makes sure that we don't overwrite any existing attributes on the object."""
   _setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj)
   if not hasattr(obj, name): _setattr(name, value)
+
 def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
   return '_'.join(filter(None, map(str.upper, ['OPENLLM', model_name, suffix.strip('_') if suffix else '', key])))
+
 # Special debug flag controled via OPENLLMDEVDEBUG
 DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR)))
 # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
 MYPY = False
 SHOW_CODEGEN: bool = DEBUG and int(os.environ.get('OPENLLMDEVDEBUG', str(0))) > 3
+
 def get_debug_mode() -> bool:
   return DEBUG or _get_debug_mode()
+
 def get_quiet_mode() -> bool:
   return not DEBUG and _get_quiet_mode()
+
 class ExceptionFilter(logging.Filter):
   def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any):
     '''A filter of all exception.'''
@@ -119,11 +129,14 @@ class ExceptionFilter(logging.Filter):
         for exc in self.EXCLUDE_EXCEPTIONS:
           if issubclass(etype, exc): return False
     return True
+
 class InfoFilter(logging.Filter):
   def filter(self, record: logging.LogRecord) -> bool:
     return logging.INFO <= record.levelno < logging.WARNING
+
 def gen_random_uuid(prefix: str | None = None) -> str:
   return '-'.join([prefix or 'openllm', str(uuid.uuid4().hex)])
+
 _LOGGING_CONFIG: dict[str, t.Any] = {
     'version': 1,
     'disable_existing_loggers': True,
@@ -154,6 +167,7 @@ _LOGGING_CONFIG: dict[str, t.Any] = {
         'level': logging.WARNING
     },
 }
+
 def configure_logging() -> None:
   '''Configure logging for OpenLLM.
 
@@ -173,6 +187,7 @@ def configure_logging() -> None:
     _LOGGING_CONFIG['root']['level'] = logging.INFO
 
   logging.config.dictConfig(_LOGGING_CONFIG)
+
 @functools.lru_cache(maxsize=1)
 def in_notebook() -> bool:
   try:
@@ -182,7 +197,9 @@ def in_notebook() -> bool:
     return 'IPKernelApp' in t.cast('dict[str, t.Any]', t.cast(t.Callable[[], 'InteractiveShell'], get_ipython)().config)
   except (ImportError, AttributeError):
     return False
+
 _dockerenv, _cgroup = Path('/.dockerenv'), Path('/proc/self/cgroup')
+
 class suppress(contextlib.suppress, contextlib.ContextDecorator):
   """A version of contextlib.suppress with decorator support.
 
@@ -191,6 +208,7 @@ class suppress(contextlib.suppress, contextlib.ContextDecorator):
   ...     {}['']
   >>> key_error()
   """
+
 def compose(*funcs: AnyCallable) -> AnyCallable:
   '''Compose any number of unary functions into a single unary function.
 
@@ -211,6 +229,7 @@ def compose(*funcs: AnyCallable) -> AnyCallable:
     return lambda *args, **kwargs: f1(f2(*args, **kwargs))
 
   return functools.reduce(compose_two, funcs)
+
 def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]:
   """Decorate a function with a transform function that is invoked on results returned from the decorated function.
 
@@ -228,10 +247,12 @@ def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]:
   ```
   """
   return lambda func: functools.wraps(func)(compose(transform, func))
+
 @apply(bool)
 @suppress(FileNotFoundError)
 def _text_in_file(text: str, filename: Path) -> bool:
   return any(text in line for line in filename.open())
+
 def in_docker() -> bool:
   '''Is this current environment running in docker?
 
@@ -240,15 +261,19 @@ def in_docker() -> bool:
   ```
   '''
   return _dockerenv.exists() or _text_in_file('docker', _cgroup)
+
 T, K = t.TypeVar('T'), t.TypeVar('K')
+
 def resolve_filepath(path: str, ctx: str | None = None) -> str:
   '''Resolve a file path to an absolute path, expand user and environment variables.'''
   try:
     return resolve_user_filepath(path, ctx)
   except FileNotFoundError:
     return path
+
 def validate_is_path(maybe_path: str) -> bool:
   return os.path.exists(os.path.dirname(resolve_filepath(maybe_path)))
+
 def generate_context(framework_name: str) -> _ModelContext:
   framework_versions = {'transformers': pkg.get_pkg_version('transformers')}
   if openllm_core.utils.is_torch_available(): framework_versions['torch'] = pkg.get_pkg_version('torch')
@@ -257,13 +282,16 @@ def generate_context(framework_name: str) -> _ModelContext:
     framework_versions['tensorflow'] = get_tf_version()
   if openllm_core.utils.is_flax_available(): framework_versions.update({'flax': pkg.get_pkg_version('flax'), 'jax': pkg.get_pkg_version('jax'), 'jaxlib': pkg.get_pkg_version('jaxlib')})
   return _ModelContext(framework_name=framework_name, framework_versions=framework_versions)
+
 _TOKENIZER_PREFIX = '_tokenizer_'
+
 def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
   '''Normalize the given attrs to a model and tokenizer kwargs accordingly.'''
   tokenizer_attrs = {k[len(_TOKENIZER_PREFIX):]: v for k, v in attrs.items() if k.startswith(_TOKENIZER_PREFIX)}
   for k in tuple(attrs.keys()):
     if k.startswith(_TOKENIZER_PREFIX): del attrs[k]
   return attrs, tokenizer_attrs
+
 # NOTE: The set marks contains a set of modules name
 # that are available above and are whitelisted
 # to be included in the extra_objects map.
@@ -312,39 +340,35 @@ _import_structure: dict[str, list[str]] = {
 
 if t.TYPE_CHECKING:
   # NOTE: The following exports useful utils from bentoml
-  from . import (
-    analytics as analytics,
-    codegen as codegen,
-    dantic as dantic,
-  )
+  from . import (analytics as analytics, codegen as codegen, dantic as dantic,)
   from .import_utils import (
-    ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES,
-    OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
-    DummyMetaclass as DummyMetaclass,
-    EnvVarMixin as EnvVarMixin,
-    is_autogptq_available as is_autogptq_available,
-    is_bitsandbytes_available as is_bitsandbytes_available,
-    is_cpm_kernels_available as is_cpm_kernels_available,
-    is_datasets_available as is_datasets_available,
-    is_einops_available as is_einops_available,
-    is_fairscale_available as is_fairscale_available,
-    is_flax_available as is_flax_available,
-    is_grpc_available as is_grpc_available,
-    is_grpc_health_available as is_grpc_health_available,
-    is_jupyter_available as is_jupyter_available,
-    is_jupytext_available as is_jupytext_available,
-    is_notebook_available as is_notebook_available,
-    is_peft_available as is_peft_available,
-    is_sentencepiece_available as is_sentencepiece_available,
-    is_tf_available as is_tf_available,
-    is_torch_available as is_torch_available,
-    is_transformers_available as is_transformers_available,
-    is_transformers_supports_agent as is_transformers_supports_agent,
-    is_transformers_supports_kbit as is_transformers_supports_kbit,
-    is_triton_available as is_triton_available,
-    is_vllm_available as is_vllm_available,
-    is_xformers_available as is_xformers_available,
-    require_backends as require_backends,
+      ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES,
+      OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
+      DummyMetaclass as DummyMetaclass,
+      EnvVarMixin as EnvVarMixin,
+      is_autogptq_available as is_autogptq_available,
+      is_bitsandbytes_available as is_bitsandbytes_available,
+      is_cpm_kernels_available as is_cpm_kernels_available,
+      is_datasets_available as is_datasets_available,
+      is_einops_available as is_einops_available,
+      is_fairscale_available as is_fairscale_available,
+      is_flax_available as is_flax_available,
+      is_grpc_available as is_grpc_available,
+      is_grpc_health_available as is_grpc_health_available,
+      is_jupyter_available as is_jupyter_available,
+      is_jupytext_available as is_jupytext_available,
+      is_notebook_available as is_notebook_available,
+      is_peft_available as is_peft_available,
+      is_sentencepiece_available as is_sentencepiece_available,
+      is_tf_available as is_tf_available,
+      is_torch_available as is_torch_available,
+      is_transformers_available as is_transformers_available,
+      is_transformers_supports_agent as is_transformers_supports_agent,
+      is_transformers_supports_kbit as is_transformers_supports_kbit,
+      is_triton_available as is_triton_available,
+      is_vllm_available as is_vllm_available,
+      is_xformers_available as is_xformers_available,
+      require_backends as require_backends,
   )
   from .representation import ReprMixin as ReprMixin
 __lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects=_extras)
diff --git a/openllm-core/src/openllm_core/utils/analytics.py b/openllm-core/src/openllm_core/utils/analytics.py
index ca58b4ec..91b501d7 100644
--- a/openllm-core/src/openllm_core/utils/analytics.py
+++ b/openllm-core/src/openllm_core/utils/analytics.py
@@ -23,12 +23,15 @@ logger = logging.getLogger(__name__)
 # This variable is a proxy that will control BENTOML_DO_NOT_TRACK
 OPENLLM_DO_NOT_TRACK = 'OPENLLM_DO_NOT_TRACK'
 DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper()
+
 @functools.lru_cache(maxsize=1)
 def do_not_track() -> bool:
   return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES
+
 @functools.lru_cache(maxsize=1)
 def _usage_event_debugging() -> bool:
   return os.environ.get('__BENTOML_DEBUG_USAGE', str(False)).lower() == 'true'
+
 def silent(func: t.Callable[P, T]) -> t.Callable[P, T]:
   @functools.wraps(func)
   def wrapper(*args: P.args, **kwargs: P.kwargs) -> t.Any:
@@ -41,10 +44,12 @@ def silent(func: t.Callable[P, T]) -> t.Callable[P, T]:
       else: logger.debug('Tracking Error: %s', err)
 
   return wrapper
+
 @silent
 def track(event_properties: attr.AttrsInstance) -> None:
   if do_not_track(): return
   _internal_analytics.track(t.cast('_internal_analytics.schemas.EventMeta', event_properties))
+
 @contextlib.contextmanager
 def set_bentoml_tracking() -> t.Generator[None, None, None]:
   original_value = os.environ.pop(_internal_analytics.BENTOML_DO_NOT_TRACK, str(False))
@@ -53,6 +58,7 @@ def set_bentoml_tracking() -> t.Generator[None, None, None]:
     yield
   finally:
     os.environ[_internal_analytics.BENTOML_DO_NOT_TRACK] = original_value
+
 class EventMeta:
   @property
   def event_name(self) -> str:
@@ -62,10 +68,12 @@ class EventMeta:
     suffix_to_remove = '_event'
     if event_name.endswith(suffix_to_remove): event_name = event_name[:-len(suffix_to_remove)]
     return event_name
+
 @attr.define
 class ModelSaveEvent(EventMeta):
   module: str
   model_size_in_kb: float
+
 @attr.define
 class OpenllmCliEvent(EventMeta):
   cmd_group: str
@@ -75,6 +83,7 @@ class OpenllmCliEvent(EventMeta):
   duration_in_ms: t.Any = attr.field(default=None)
   error_type: str = attr.field(default=None)
   return_code: int = attr.field(default=None)
+
 @attr.define
 class StartInitEvent(EventMeta):
   model_name: str
@@ -83,6 +92,7 @@ class StartInitEvent(EventMeta):
   @staticmethod
   def handler(llm_config: openllm_core.LLMConfig) -> StartInitEvent:
     return StartInitEvent(model_name=llm_config['model_name'], llm_config=llm_config.model_dump())
+
 def track_start_init(llm_config: openllm_core.LLMConfig) -> None:
   if do_not_track(): return
   track(StartInitEvent.handler(llm_config))
diff --git a/openllm-core/src/openllm_core/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py
index dabd3216..91209c5c 100644
--- a/openllm-core/src/openllm_core/utils/codegen.py
+++ b/openllm-core/src/openllm_core/utils/codegen.py
@@ -18,6 +18,7 @@ logger = logging.getLogger(__name__)
 
 # sentinel object for unequivocal object() getattr
 _sentinel = object()
+
 def has_own_attribute(cls: type[t.Any], attrib_name: t.Any) -> bool:
   """Check whether *cls* defines *attrib_name* (and doesn't just inherit it)."""
   attr = getattr(cls, attrib_name, _sentinel)
@@ -26,14 +27,17 @@ def has_own_attribute(cls: type[t.Any], attrib_name: t.Any) -> bool:
     a = getattr(base_cls, attrib_name, None)
     if attr is a: return False
   return True
+
 def get_annotations(cls: type[t.Any]) -> DictStrAny:
   if has_own_attribute(cls, '__annotations__'): return cls.__annotations__
   return t.cast('DictStrAny', {})
+
 def is_class_var(annot: str | t.Any) -> bool:
   annot = str(annot)
   # Annotation can be quoted.
   if annot.startswith(("'", '"')) and annot.endswith(("'", '"')): annot = annot[1:-1]
   return annot.startswith(('typing.ClassVar', 't.ClassVar', 'ClassVar', 'typing_extensions.ClassVar',))
+
 def add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str | None = None) -> _T:
   try:
     method_or_cls.__module__ = cls.__module__
@@ -48,8 +52,10 @@ def add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str
   except AttributeError:
     pass
   return method_or_cls
+
 def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = '') -> None:
   eval(compile(script, filename, 'exec'), globs, locs)
+
 def _make_method(name: str, script: str, filename: str, globs: DictStrAny) -> AnyCallable:
   locs: DictStrAny = {}
   # In order of debuggers like PDB being able to step through the code, we add a fake linecache entry.
@@ -64,6 +70,7 @@ def _make_method(name: str, script: str, filename: str, globs: DictStrAny) -> An
       count += 1
   _compile_and_eval(script, globs, locs, filename)
   return locs[name]
+
 def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t.Any]:
   '''Create a tuple subclass to hold class attributes.
 
@@ -86,8 +93,10 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t.
   if SHOW_CODEGEN: logger.info('Generated class for %s:\n\n%s', attr_class_name, '\n'.join(attr_class_template))
   _compile_and_eval('\n'.join(attr_class_template), globs)
   return globs[attr_class_name]
+
 def generate_unique_filename(cls: type[t.Any], func_name: str) -> str:
   return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>"
+
 def generate_function(
     typ: type[t.Any], func_name: str, lines: list[str] | None, args: tuple[str, ...] | None, globs: dict[str, t.Any], annotations: dict[str, t.Any] | None = None
 ) -> AnyCallable:
@@ -97,6 +106,7 @@ def generate_function(
   if annotations: meth.__annotations__ = annotations
   if SHOW_CODEGEN: logger.info('Generated script for %s:\n\n%s', typ, script)
   return meth
+
 def make_env_transformer(
     cls: type[openllm_core.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None,
 ) -> AnyCallable:
@@ -123,6 +133,7 @@ def make_env_transformer(
   ]
   fields_ann = 'list[attr.Attribute[t.Any]]'
   return generate_function(cls, '__auto_env', lines, args=('_', 'fields'), globs=globs, annotations={'_': 'type[LLMConfig]', 'fields': fields_ann, 'return': fields_ann})
+
 def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T:
   '''Enhance sdk with nice repr that plays well with your brain.'''
   from openllm_core.utils import ReprMixin
@@ -153,4 +164,5 @@ def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T:
           func,
       )
   )
+
 __all__ = ['gen_sdk', 'make_attr_tuple_class', 'make_env_transformer', 'generate_unique_filename', 'generate_function']
diff --git a/openllm-core/src/openllm_core/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py
index d6f5cb69..b48e0aeb 100644
--- a/openllm-core/src/openllm_core/utils/dantic.py
+++ b/openllm-core/src/openllm_core/utils/dantic.py
@@ -36,8 +36,10 @@ __all__ = [
     'JsonType',
     'BytesType'
 ]
+
 def __dir__() -> list[str]:
   return sorted(__all__)
+
 def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, typ: t.Any = None, suffix_generation: bool = False, suffix_sampling: bool = False) -> t.Callable[[FC], FC]:
   # TODO: support parsing nested attrs class and Union
   envvar = field.metadata['env']
@@ -66,6 +68,7 @@ def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, t
       show_envvar=True,
       envvar=envvar,
   )
+
 def env_converter(value: t.Any, env: str | None = None) -> t.Any:
   if env is not None:
     value = os.environ.get(env, value)
@@ -75,6 +78,7 @@ def env_converter(value: t.Any, env: str | None = None) -> t.Any:
       except orjson.JSONDecodeError as err:
         raise RuntimeError(f"Failed to parse ({value!r}) from '{env}': {err}") from None
   return value
+
 def Field(
     default: t.Any = None,
     *,
@@ -137,6 +141,7 @@ def Field(
     attrs.pop('default')
 
   return attr.field(metadata=metadata, validator=_validator, converter=converter, **attrs)
+
 def parse_type(field_type: t.Any) -> ParamType | tuple[ParamType, ...]:
   """Transforms the pydantic field's type into a click-compatible type.
 
@@ -167,6 +172,7 @@ def parse_type(field_type: t.Any) -> ParamType | tuple[ParamType, ...]:
   if lenient_issubclass(field_type, bytes): return BytesType()
   # return the current type: it should be a primitive
   return field_type
+
 def is_typing(field_type: type) -> bool:
   '''Checks whether the current type is a module-like type.
 
@@ -180,6 +186,7 @@ def is_typing(field_type: type) -> bool:
   if raw is None: return False
   if raw is type or raw is t.Type: return True
   return False
+
 def is_literal(field_type: type) -> bool:
   '''Checks whether the given field type is a Literal type or not.
 
@@ -194,6 +201,7 @@ def is_literal(field_type: type) -> bool:
   '''
   origin = t.get_origin(field_type)
   return origin is not None and origin is t.Literal
+
 class ModuleType(ParamType):
   name = 'module'
 
@@ -215,6 +223,7 @@ class ModuleType(ParamType):
       return value
     except Exception as exc:
       self.fail(f"'{value}' is not a valid object ({type(exc)}: {exc!s})", param, ctx)
+
 class EnumChoice(click.Choice):
   name = 'enum'
 
@@ -237,6 +246,7 @@ class EnumChoice(click.Choice):
     if isinstance(result, str):
       result = self.internal_type[result]
     return result
+
 class LiteralChoice(EnumChoice):
   name = 'literal'
 
@@ -249,6 +259,7 @@ class LiteralChoice(EnumChoice):
     _mapping = {str(v): v for v in values}
     super(EnumChoice, self).__init__(list(_mapping), case_sensitive)
     self.internal_type = item_type
+
 def allows_multiple(field_type: type[t.Any]) -> bool:
   """Checks whether the current type allows for multiple arguments to be provided as input or not.
 
@@ -273,6 +284,7 @@ def allows_multiple(field_type: type[t.Any]) -> bool:
     # For the moment, only non-composite types are allowed.
     return not isinstance(args, tuple)
   return False
+
 def is_mapping(field_type: type) -> bool:
   '''Checks whether this field represents a dictionary or JSON object.
 
@@ -289,6 +301,7 @@ def is_mapping(field_type: type) -> bool:
   origin = t.get_origin(field_type)
   if origin is None: return False
   return lenient_issubclass(origin, t.Mapping)
+
 def is_container(field_type: type) -> bool:
   """Checks whether the current type is a container type ('contains' other types), like lists and tuples.
 
@@ -307,6 +320,7 @@ def is_container(field_type: type) -> bool:
   # Early out for non-typing objects
   if origin is None: return False
   return lenient_issubclass(origin, t.Container)
+
 def parse_container_args(field_type: type[t.Any]) -> ParamType | tuple[ParamType, ...]:
   '''Parses the arguments inside a container type (lists, tuples and so on).
 
@@ -329,6 +343,7 @@ def parse_container_args(field_type: type[t.Any]) -> ParamType | tuple[ParamType
     return parse_single_arg(args[0])
   # Then deal with fixed-length containers: Tuple[str, int, int]
   return tuple(parse_single_arg(arg) for arg in args)
+
 def parse_single_arg(arg: type) -> ParamType:
   """Returns the click-compatible type for container origin types.
 
@@ -349,6 +364,7 @@ def parse_single_arg(arg: type) -> ParamType:
   if is_container(arg): return JsonType()
   if lenient_issubclass(arg, bytes): return BytesType()
   return click_types.convert_type(arg)
+
 class BytesType(ParamType):
   name = 'bytes'
 
@@ -358,6 +374,7 @@ class BytesType(ParamType):
       return str.encode(value)
     except Exception as exc:
       self.fail(f"'{value}' is not a valid string ({exc!s})", param, ctx)
+
 CYGWIN = sys.platform.startswith('cygwin')
 WIN = sys.platform.startswith('win')
 if sys.platform.startswith('win') and WIN:
@@ -369,6 +386,7 @@ else:
 
   def _get_argv_encoding() -> str:
     return getattr(sys.stdin, 'encoding', None) or sys.getfilesystemencoding()
+
 class CudaValueType(ParamType):
   name = 'cuda'
   envvar_list_splitter = ','
@@ -413,7 +431,9 @@ class CudaValueType(ParamType):
 
   def __repr__(self) -> str:
     return 'STRING'
+
 CUDA = CudaValueType()
+
 class JsonType(ParamType):
   name = 'json'
 
diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py
index d0dbe21d..15fc9b52 100644
--- a/openllm-core/src/openllm_core/utils/import_utils.py
+++ b/openllm-core/src/openllm_core/utils/import_utils.py
@@ -28,6 +28,7 @@ USE_TF = os.environ.get('USE_TF', 'AUTO').upper()
 USE_TORCH = os.environ.get('USE_TORCH', 'AUTO').upper()
 USE_JAX = os.environ.get('USE_FLAX', 'AUTO').upper()
 FORCE_TF_AVAILABLE = os.environ.get('FORCE_TF_AVAILABLE', 'AUTO').upper()
+
 def _is_package_available(package: str) -> bool:
   _package_available = importlib.util.find_spec(package) is not None
   if _package_available:
@@ -36,6 +37,7 @@ def _is_package_available(package: str) -> bool:
     except importlib.metadata.PackageNotFoundError:
       _package_available = False
   return _package_available
+
 _torch_available = importlib.util.find_spec('torch') is not None
 _tf_available = importlib.util.find_spec('tensorflow') is not None
 _flax_available = importlib.util.find_spec('jax') is not None and importlib.util.find_spec('flax') is not None
@@ -56,44 +58,64 @@ _autogptq_available = _is_package_available('auto_gptq')
 _sentencepiece_available = _is_package_available('sentencepiece')
 _xformers_available = _is_package_available('xformers')
 _fairscale_available = _is_package_available('fairscale')
+
 def is_transformers_available() -> bool:
   return _transformers_available
+
 def is_grpc_available() -> bool:
   return _grpc_available
+
 def is_grpc_health_available() -> bool:
   return _grpc_health_available
+
 def is_transformers_supports_kbit() -> bool:
   return pkg.pkg_version_info('transformers')[:2] >= (4, 30)
+
 def is_transformers_supports_agent() -> bool:
   return pkg.pkg_version_info('transformers')[:2] >= (4, 29)
+
 def is_jupyter_available() -> bool:
   return _jupyter_available
+
 def is_jupytext_available() -> bool:
   return _jupytext_available
+
 def is_notebook_available() -> bool:
   return _notebook_available
+
 def is_triton_available() -> bool:
   return _triton_available
+
 def is_datasets_available() -> bool:
   return _datasets_available
+
 def is_peft_available() -> bool:
   return _peft_available
+
 def is_einops_available() -> bool:
   return _einops_available
+
 def is_cpm_kernels_available() -> bool:
   return _cpm_kernel_available
+
 def is_bitsandbytes_available() -> bool:
   return _bitsandbytes_available
+
 def is_autogptq_available() -> bool:
   return _autogptq_available
+
 def is_vllm_available() -> bool:
   return _vllm_available
+
 def is_sentencepiece_available() -> bool:
   return _sentencepiece_available
+
 def is_xformers_available() -> bool:
   return _xformers_available
+
 def is_fairscale_available() -> bool:
   return _fairscale_available
+
 def is_torch_available() -> bool:
   global _torch_available
   if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
@@ -106,6 +128,7 @@ def is_torch_available() -> bool:
     logger.info('Disabling PyTorch because USE_TF is set')
     _torch_available = False
   return _torch_available
+
 def is_tf_available() -> bool:
   global _tf_available
   if FORCE_TF_AVAILABLE in ENV_VARS_TRUE_VALUES: _tf_available = True
@@ -143,6 +166,7 @@ def is_tf_available() -> bool:
       logger.info('Disabling Tensorflow because USE_TORCH is set')
       _tf_available = False
   return _tf_available
+
 def is_flax_available() -> bool:
   global _flax_available
   if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
@@ -155,6 +179,7 @@ def is_flax_available() -> bool:
   else:
     _flax_available = False
   return _flax_available
+
 VLLM_IMPORT_ERROR_WITH_PYTORCH = '''\
 {0} requires the vLLM library but it was not found in your environment.
 However, we were able to find a PyTorch installation. PyTorch classes do not begin
@@ -270,6 +295,7 @@ BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([('flax', (is_flax_available,
 ), ('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), ('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)), (
     'fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR)
 )])
+
 class DummyMetaclass(abc.ABCMeta):
   '''Metaclass for dummy object.
 
@@ -280,6 +306,7 @@ class DummyMetaclass(abc.ABCMeta):
   def __getattribute__(cls, key: str) -> t.Any:
     if key.startswith('_'): return super().__getattribute__(key)
     require_backends(cls, cls._backends)
+
 def require_backends(o: t.Any, backends: t.MutableSequence[str]) -> None:
   if not isinstance(backends, (list, tuple)): backends = list(backends)
   name = o.__name__ if hasattr(o, '__name__') else o.__class__.__name__
@@ -294,6 +321,7 @@ def require_backends(o: t.Any, backends: t.MutableSequence[str]) -> None:
     if 'flax' not in backends and is_flax_available() and not is_vllm_available(): raise ImportError(VLLM_IMPORT_ERROR_WITH_FLAX.format(name))
   failed = [msg.format(name) for available, msg in (BACKENDS_MAPPING[backend] for backend in backends) if not available()]
   if failed: raise ImportError(''.join(failed))
+
 class EnvVarMixin(ReprMixin):
   model_name: str
   config: str
diff --git a/openllm-core/src/openllm_core/utils/lazy.py b/openllm-core/src/openllm_core/utils/lazy.py
index 92b0aebd..73b7d5ff 100644
--- a/openllm-core/src/openllm_core/utils/lazy.py
+++ b/openllm-core/src/openllm_core/utils/lazy.py
@@ -15,6 +15,7 @@ import attr
 
 import openllm_core
 __all__ = ['VersionInfo', 'LazyModule']
+
 # vendorred from attrs
 @functools.total_ordering
 @attr.attrs(eq=False, order=False, slots=True, frozen=True, repr=False)
@@ -53,7 +54,9 @@ class VersionInfo:
 
   def __repr__(self) -> str:
     return '{0}.{1}.{2}'.format(*attr.astuple(self)[:3])
+
 _sentinel, _reserved_namespace = object(), {'__openllm_migration__'}
+
 class LazyModule(types.ModuleType):
   # Very heavily inspired by optuna.integration._IntegrationModule: https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
   def __init__(
diff --git a/openllm-core/src/openllm_core/utils/representation.py b/openllm-core/src/openllm_core/utils/representation.py
index 3b75b2d9..733562c3 100644
--- a/openllm-core/src/openllm_core/utils/representation.py
+++ b/openllm-core/src/openllm_core/utils/representation.py
@@ -9,6 +9,7 @@ from openllm_core import utils
 if t.TYPE_CHECKING: from openllm_core._typing_compat import TypeAlias
 
 ReprArgs: TypeAlias = t.Generator[t.Tuple[t.Optional[str], t.Any], None, None]
+
 class ReprMixin:
   @property
   @abstractmethod
diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py
index 720825f9..7ef98cf9 100644
--- a/openllm-python/src/openllm/_embeddings.py
+++ b/openllm-python/src/openllm/_embeddings.py
@@ -13,6 +13,7 @@ if t.TYPE_CHECKING: import torch
 
 _GENERIC_EMBEDDING_ID = 'sentence-transformers/all-MiniLM-L6-v2'
 _BENTOMODEL_ID = 'sentence-transformers--all-MiniLM-L6-v2'
+
 def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
   try:
     return bentoml.transformers.get(ids)
@@ -36,6 +37,7 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
           _GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt']
       )
       return bentomodel
+
 class GenericEmbeddingRunnable(bentoml.Runnable):
   SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'cpu')
   SUPPORTS_CPU_MULTI_THREADING = True
@@ -67,4 +69,5 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
 __all__ = ['GenericEmbeddingRunnable']
diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py
index 1c4e8fdc..52a3af15 100644
--- a/openllm-python/src/openllm/_generation.py
+++ b/openllm-python/src/openllm/_generation.py
@@ -8,6 +8,7 @@ if t.TYPE_CHECKING: import torch, openllm
 # reexport from transformers
 LogitsProcessorList = transformers.LogitsProcessorList
 StoppingCriteriaList = transformers.StoppingCriteriaList
+
 class StopSequenceCriteria(transformers.StoppingCriteria):
   def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
     if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
@@ -15,9 +16,11 @@ class StopSequenceCriteria(transformers.StoppingCriteria):
 
   def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool:
     return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)
+
 class StopOnTokens(transformers.StoppingCriteria):
   def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool:
     return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}
+
 def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList:
   generation_config = config.generation_config
   logits_processor = transformers.LogitsProcessorList()
@@ -26,16 +29,20 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr
   if 1e-8 <= generation_config['top_p']: logits_processor.append(transformers.TopPLogitsWarper(generation_config['top_p']))
   if generation_config['top_k'] > 0: logits_processor.append(transformers.TopKLogitsWarper(generation_config['top_k']))
   return logits_processor
+
 # NOTE: The ordering here is important. Some models have two of these and we have a preference for which value gets used.
 SEQLEN_KEYS = ['max_sequence_length', 'seq_length', 'max_position_embeddings', 'max_seq_len', 'model_max_length']
+
 def get_context_length(config: transformers.PretrainedConfig) -> int:
   rope_scaling = getattr(config, 'rope_scaling', None)
   rope_scaling_factor = config.rope_scaling['factor'] if rope_scaling else 1.0
   for key in SEQLEN_KEYS:
     if getattr(config, key, None) is not None: return int(rope_scaling_factor * getattr(config, key))
   return 2048
+
 def is_sentence_complete(output: str) -> bool:
   return output.endswith(('.', '?', '!', '...', '。', '?', '!', '…', '"', "'", '”'))
+
 def is_partial_stop(output: str, stop_str: str) -> bool:
   '''Check whether the output contains a partial stop str.'''
   for i in range(0, min(len(output), len(stop_str))):
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 4505f05b..954cf4d4 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -49,15 +49,19 @@ else:
 ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConfig', str]]]
 
 logger = logging.getLogger(__name__)
+
 class ModelSignatureDict(t.TypedDict, total=False):
   batchable: bool
   batch_dim: t.Union[t.Tuple[int, int], int]
   input_spec: NotRequired[t.Union[t.Any, t.Tuple[t.Any]]]
   output_spec: NotRequired[t.Any]
+
 def normalise_model_name(name: str) -> str:
   return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else re.sub('[^a-zA-Z0-9]+', '-', name)
+
 # the below is similar to peft.utils.other.CONFIG_NAME
 PEFT_CONFIG_NAME = 'adapter_config.json'
+
 def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapping:
   '''Resolve the type of the PeftConfig given the adapter_map.
 
@@ -88,7 +92,9 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp
     if _peft_type not in resolved: resolved[_peft_type] = ()
     resolved[_peft_type] += (_AdaptersTuple((path_or_adapter_id, resolve_name, resolved_config)),)
   return resolved
+
 _reserved_namespace = {'config_class', 'model', 'tokenizer', 'import_kwargs'}
+
 class LLMInterface(abc.ABC, t.Generic[M, T]):
   '''This defines the loose contract for all openllm.LLM implementations.'''
   @property
@@ -241,23 +247,31 @@ class LLMInterface(abc.ABC, t.Generic[M, T]):
         **attrs: t.Any
     ) -> None:
       '''Generated __attrs_init__ for openllm.LLM.'''
+
 _R = t.TypeVar('_R', covariant=True)
+
 class _import_model_wrapper(t.Generic[_R, M, T], t.Protocol):
   def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R:
     ...
+
 class _load_model_wrapper(t.Generic[M, T], t.Protocol):
   def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
     ...
+
 class _load_tokenizer_wrapper(t.Generic[M, T], t.Protocol):
   def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T:
     ...
+
 class _llm_post_init_wrapper(t.Generic[M, T], t.Protocol):
   def __call__(self, llm: LLM[M, T]) -> T:
     ...
+
 class _save_pretrained_wrapper(t.Generic[M, T], t.Protocol):
   def __call__(self, llm: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
     ...
+
 _object_setattr = object.__setattr__
+
 # NOTE: the following wrapper are a light meta ops for wrapping default params to internal methods implementation.
 def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
   @functools.wraps(f)
@@ -269,11 +283,14 @@ def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Ca
     return f(self, *decls, trust_remote_code=trust_remote_code, **attrs)
 
   return wrapper
+
 _DEFAULT_TOKENIZER = 'hf-internal-testing/llama-tokenizer'
+
 def get_engine_args(llm: LLM[M, T], tokenizer: str = _DEFAULT_TOKENIZER) -> vllm.EngineArgs:
   return vllm.EngineArgs(
       model=llm._bentomodel.path, tokenizer=tokenizer, tokenizer_mode='auto', tensor_parallel_size=1 if device_count() < 2 else device_count(), dtype='auto', worker_use_ray=False
   )
+
 def _wrapped_load_model(f: _load_model_wrapper[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
   @functools.wraps(f)
   def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
@@ -289,12 +306,14 @@ def _wrapped_load_model(f: _load_model_wrapper[M, T]) -> t.Callable[[LLM[M, T]],
       return f(self, *(*model_decls, *decls), **{**model_attrs, **attrs})
 
   return wrapper
+
 def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]) -> t.Callable[[LLM[M, T]], T]:
   @functools.wraps(f)
   def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
     return f(self, **{**self.llm_parameters[-1], **tokenizer_attrs})
 
   return wrapper
+
 def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]:
   @functools.wraps(f)
   def wrapper(self: LLM[M, T]) -> None:
@@ -302,6 +321,7 @@ def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M
     f(self)
 
   return wrapper
+
 def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[LLM[M, T], str | pathlib.Path], None]:
   @functools.wraps(f)
   def wrapper(self: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None:
@@ -312,6 +332,7 @@ def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[L
     f(self, save_directory, **attrs)
 
   return wrapper
+
 def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable:
   # update docstring for given entrypoint
   original_fn = getattr(cls, fn, getattr(LLMInterface, fn))
@@ -323,6 +344,7 @@ def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable:
     '''
   setattr(cls, fn, original_fn)
   return original_fn
+
 def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]:
   attributes = {
       'import_model': _wrapped_import_model,
@@ -361,8 +383,10 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
     lines.extend([_setattr_class(key, f"cls.{fn} is not _cached_LLMInterface_get('{fn}')"), f"__gen_docstring(cls, '{fn}')",])
     anns[key] = interface_anns.get(key)
   return codegen.generate_function(cls, '__assign_llm_attr', lines, args=('cls', *args), globs=globs, annotations=anns)
+
 def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
   return generation_result[0]['outputs'][0]['text']
+
 def vllm_generate_iterator(
     self: LLM['vllm.LLMEngine', T], prompt: str, /, *, echo: bool = False, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any
 ) -> t.Iterator[dict[str, t.Any]]:
@@ -387,6 +411,7 @@ def vllm_generate_iterator(
       else: text_outputs = [output.text for output in request_output.outputs]
       yield {'text': text_outputs, 'error_code': 0}
       if request_output.finished: break
+
 def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
   request_id: str = attrs.pop('request_id', None)
   if request_id is None: raise ValueError('request_id must not be None.')
@@ -396,7 +421,9 @@ def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -
   while self.model.has_unfinished_requests():
     outputs.extend([r for r in self.model.step() if r.finished])
   return [unmarshal_vllm_outputs(i) for i in outputs]
+
 _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class('AdaptersTuple', ['adapter_id', 'name', 'config'])
+
 @attr.define(slots=True, repr=False, init=False)
 class LLM(LLMInterface[M, T], ReprMixin):
   if t.TYPE_CHECKING: __name__: str
@@ -1140,6 +1167,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
     del past_key_values, out
     gc.collect()
     torch.cuda.empty_cache()
+
 # fmt: off
 @overload
 def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py
index da02f3ac..18c2952e 100644
--- a/openllm-python/src/openllm/_quantisation.py
+++ b/openllm-python/src/openllm/_quantisation.py
@@ -14,12 +14,15 @@ autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'),
 logger = logging.getLogger(__name__)
 
 QuantiseMode = t.Literal['int8', 'int4', 'gptq']
+
 @overload
 def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
   ...
+
 @overload
 def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
   ...
+
 def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
   # 8 bit configuration
   int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index e94605f5..650fdf28 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -38,17 +38,20 @@ runners: list[AbstractRunner] = [runner]
 if not runner.supports_embeddings: runners.append(generic_embedding_runner)
 svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)
 _JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None})
+
 @svc.api(route='/v1/generate', input=_JsonInput, output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)}))
 async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
   qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
   config = qa_inputs.llm_config.model_dump()
   responses = await runner.generate.async_run(qa_inputs.prompt, **{'adapter_name': qa_inputs.adapter_name, **config})
   return openllm.GenerationOutput(responses=responses, configuration=config)
+
 @svc.api(route='/v1/generate_stream', input=_JsonInput, output=bentoml.io.Text(content_type='text/event-stream'))
 async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
   echo = input_dict.pop('echo', False)
   qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
   return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump())
+
 @svc.api(
     route='/v1/metadata',
     input=bentoml.io.Text(),
@@ -72,6 +75,7 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
       supports_embeddings=runner.supports_embeddings,
       supports_hf_agent=runner.supports_hf_agent
   )
+
 @svc.api(
     route='/v1/embeddings',
     input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
@@ -111,6 +115,7 @@ async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
   embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode  # type: ignore[type-arg,assignment,valid-type]
   responses = (await embed_call.async_run(phrases))[0]
   return openllm.EmbeddingsOutput(embeddings=responses['embeddings'], num_tokens=responses['num_tokens'])
+
 if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
 
   async def hf_agent(request: Request) -> Response:
@@ -127,11 +132,13 @@ if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
 
   hf_app = Starlette(debug=True, routes=[Route('/agent', hf_agent, methods=['POST'])])
   svc.mount_asgi_app(hf_app, path='/hf')
+
 # general metadata app
 async def list_adapter_v1(_: Request) -> Response:
   res: dict[str, t.Any] = {}
   if runner.peft_adapters['success'] is True: res['result'] = {k: v.to_dict() for k, v in runner.peft_adapters['result'].items()}
   res.update({'success': runner.peft_adapters['success'], 'error_msg': runner.peft_adapters['error_msg']})
   return JSONResponse(res, status_code=200)
+
 adapters_app_v1 = Starlette(debug=True, routes=[Route('/adapters', list_adapter_v1, methods=['GET'])])
 svc.mount_asgi_app(adapters_app_v1, path='/v1')
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index fcfe317a..3e0fd119 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -30,6 +30,7 @@ if t.TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
+
 def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
   '''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.'''
   if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != 'true': return None
@@ -48,6 +49,7 @@ def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'ope
       env.install(builder.build_system_requires)
       return builder.build('wheel', path, config_settings={'--global-option': '--quiet'})
   raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')
+
 def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] | None = None, adapter_map: dict[str, str | None] | None = None,) -> PythonOptions:
   packages = ['openllm', 'scipy']  # apparently bnb misses this one
   if adapter_map is not None: packages += ['openllm[fine-tune]']
@@ -100,6 +102,7 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
   ]
   if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
   return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=['https://download.pytorch.org/whl/cu118'])
+
 def construct_docker_options(
     llm: openllm.LLM[t.Any, t.Any],
     _: FS,
@@ -137,8 +140,10 @@ def construct_docker_options(
   if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
   env_dict[_env.runtime] = _env['runtime_value']
   return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
+
 OPENLLM_MODEL_NAME = '# openllm: model name'
 OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
+
 class ModelNameFormatter(string.Formatter):
   model_keyword: LiteralString = '__model_name__'
 
@@ -156,11 +161,15 @@ class ModelNameFormatter(string.Formatter):
       return True
     except ValueError:
       return False
+
 class ModelIdFormatter(ModelNameFormatter):
   model_keyword: LiteralString = '__model_id__'
+
 class ModelAdapterMapFormatter(ModelNameFormatter):
   model_keyword: LiteralString = '__model_adapter_map__'
+
 _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
+
 def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
   from openllm_core.utils import DEBUG
   model_name = llm.config['model_name']
@@ -174,6 +183,7 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | N
   script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
   if DEBUG: logger.info('Generated script:\n%s', script)
   llm_fs.writetext(llm.config['service_name'], script)
+
 @inject
 def create_bento(
     bento_tag: bentoml.Tag,
diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py
index bcf91a2d..328c1b48 100644
--- a/openllm-python/src/openllm/bundle/oci/__init__.py
+++ b/openllm-python/src/openllm/bundle/oci/__init__.py
@@ -40,17 +40,23 @@ _OWNER = 'bentoml'
 _REPO = 'openllm'
 
 _module_location = openllm_core.utils.pkg.source_locations('openllm')
+
 @functools.lru_cache
 @openllm_core.utils.apply(str.lower)
 def get_base_container_name(reg: LiteralContainerRegistry) -> str:
   return _CONTAINER_REGISTRY[reg]
+
 def _convert_version_from_string(s: str) -> VersionInfo:
   return VersionInfo.from_version_string(s)
+
 def _commit_time_range(r: int = 5) -> str:
   return (datetime.now(timezone.utc) - timedelta(days=r)).strftime('%Y-%m-%dT%H:%M:%SZ')
+
 class VersionNotSupported(openllm.exceptions.OpenLLMException):
   """Raised when the stable release is too low that it doesn't include OpenLLM base container."""
+
 _RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy'])
+
 def nightly_resolver(cls: type[RefResolver]) -> str:
   # NOTE: all openllm container will have sha-<git_hash[:7]>
   # This will use docker to run skopeo to determine the correct latest tag that is available
@@ -64,6 +70,7 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
     return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message'])
   # now is the correct behaviour
   return orjson.loads(subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', 'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2]
+
 @attr.attrs(eq=False, order=False, slots=True, frozen=True)
 class RefResolver:
   git_hash: str = attr.field()
@@ -108,9 +115,11 @@ class RefResolver:
     if self.strategy == 'latest': return 'latest'
     elif self.strategy == 'nightly': return self.git_hash
     else: return repr(self.version)
+
 @functools.lru_cache(maxsize=256)
 def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str:
   return RefResolver.from_strategy(strategy).tag
+
 def build_container(
     registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None,
     version_strategy: LiteralContainerVersionStrategy = 'release',
@@ -146,13 +155,16 @@ def build_container(
   except Exception as err:
     raise openllm.exceptions.OpenLLMException(f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err
   return tags
+
 if t.TYPE_CHECKING:
   CONTAINER_NAMES: dict[LiteralContainerRegistry, str]
   supported_registries: list[str]
 
 __all__ = ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver']
+
 def __dir__() -> list[str]:
   return sorted(__all__)
+
 def __getattr__(name: str) -> t.Any:
   if name == 'supported_registries': return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))()
   elif name == 'CONTAINER_NAMES': return _CONTAINER_REGISTRY
diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py
index 729e8992..f3fc3b07 100644
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -31,10 +31,13 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
 
 _AnyCallable = t.Callable[..., t.Any]
 FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])
+
 def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
   return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})]
+
 def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
   return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
+
 def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
   # TODO: Support amd.com/gpu on k8s
   _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
@@ -55,7 +58,9 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
   environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
   if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
   return environ
+
 _adapter_mapping_key = 'adapter_map'
+
 def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...] | None) -> None:
   if not value: return None
   if _adapter_mapping_key not in ctx.params: ctx.params[_adapter_mapping_key] = {}
@@ -69,6 +74,7 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
       pass
     ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0] if len(adapter_name) > 0 else None
   return None
+
 def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command:
   llm_config = openllm.AutoConfig.for_model(model)
   command_attrs: DictStrAny = dict(
@@ -212,6 +218,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
     return config
 
   return start_cmd
+
 def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, **command_attrs: t.Any) -> click.Command:
   context_settings = command_attrs.pop('context_settings', {})
   context_settings.update({'ignore_unknown_options': True, 'allow_extra_args': True})
@@ -224,6 +231,7 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
     return llm_config
 
   return noop
+
 def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
   if adapter_map and not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
   if quantize and llm_config.default_implementation() == 'vllm':
@@ -232,6 +240,7 @@ def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: Lite
   if requirements is not None and len(requirements) > 0:
     missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
     if len(missing_requirements) > 0: termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
+
 def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
   def wrapper(fn: FC) -> t.Callable[[FC], FC]:
     composed = openllm.utils.compose(
@@ -301,6 +310,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
     return composed(fn)
 
   return wrapper
+
 def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
   if value is None: return value
   if not isinstance(value, tuple): ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})')
@@ -308,10 +318,12 @@ def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tup
   # NOTE: --device all is a special case
   if len(el) == 1 and el[0] == 'all': return tuple(map(str, openllm.utils.available_devices()))
   return el
+
 # NOTE: A list of bentoml option that is not needed for parsing.
 # NOTE: User shouldn't set '--working-dir', as OpenLLM will setup this.
 # NOTE: production is also deprecated
 _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
+
 def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
   '''Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.'''
   from bentoml_cli.cli import cli
@@ -339,7 +351,9 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]
     return group(f)
 
   return decorator
+
 _http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args(True)
+
 def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
   '''General ``@click`` decorator with some sauce.
 
@@ -356,8 +370,10 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC |
     return t.cast(FC, callback(*param_decls, **attrs)(f) if f is not None else callback(*param_decls, **attrs))
 
   return decorator
+
 cli_option = functools.partial(_click_factory_type, attr='option')
 cli_argument = functools.partial(_click_factory_type, attr='argument')
+
 def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]:
   output = ['json', 'pretty', 'porcelain']
 
@@ -377,6 +393,7 @@ def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput
       shell_complete=complete_output_var,
       **attrs
   )(f)
+
 def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option(
       '--fast/--no-fast',
@@ -390,10 +407,13 @@ def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
                                                                                                           ''',
       **attrs
   )(f)
+
 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f)
+
 def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)
+
 def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option(
       '--model-id',
@@ -404,10 +424,13 @@ def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.E
       help='Optional model_id name or path for (fine-tune) weight.',
       **attrs
   )(f)
+
 def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. It will be inferred automatically from model-id.', **attrs)(f)
+
 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
+
 def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option(
       '--quantise',
@@ -433,6 +456,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model
       > [!NOTE] that quantization are currently only available in *PyTorch* models.''',
       **attrs
   )(f)
+
 def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option(
       '--workers-per-resource',
@@ -458,6 +482,7 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
       ),
       **attrs
   )(f)
+
 def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option(
       '--bettertransformer',
@@ -469,6 +494,7 @@ def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = Fal
       if not build else 'Set default environment variable whether to serve this model with FasterTransformer in build time.',
       **attrs
   )(f)
+
 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option(
       '--serialisation',
@@ -498,6 +524,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
       ''',
       **attrs
   )(f)
+
 def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
   return cli_option(
       '--container-registry',
@@ -517,7 +544,9 @@ def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) ->
       ''',
       **attrs
   )(f)
+
 _wpr_strategies = {'round_robin', 'conserved'}
+
 def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
   if value is None: return value
   value = inflection.underscore(value)
@@ -529,6 +558,7 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
       raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None
     else:
       return value
+
 def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
   if value is None: return value
   if value not in openllm.bundle.supported_registries: raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param)
diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py
index d32cbc67..3dedf91f 100644
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -22,6 +22,7 @@ if t.TYPE_CHECKING:
   from openllm_core._configuration import LLMConfig
   from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy, LiteralRuntime, LiteralString
 logger = logging.getLogger(__name__)
+
 def _start(
     model_name: str,
     /,
@@ -108,6 +109,7 @@ def _start(
   return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(
       args=args if len(args) > 0 else None, standalone_mode=False
   )
+
 @inject
 def _build(
     model_name: str,
@@ -213,6 +215,7 @@ def _build(
   if matched is None:
     raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
   return bentoml.get(matched.group(1), _bento_store=bento_store)
+
 def _import_model(
     model_name: str,
     /,
@@ -262,6 +265,7 @@ def _import_model(
   if additional_args is not None: args.extend(additional_args)
   if quantize is not None: args.extend(['--quantize', quantize])
   return import_command.main(args=args, standalone_mode=False)
+
 def _list_models() -> dict[str, t.Any]:
   '''List all available models within the local store.'''
   from .entrypoint import models_command
diff --git a/openllm-python/src/openllm/cli/extension/build_base_container.py b/openllm-python/src/openllm/cli/extension/build_base_container.py
index c2a9af60..5ced51fd 100644
--- a/openllm-python/src/openllm/cli/extension/build_base_container.py
+++ b/openllm-python/src/openllm/cli/extension/build_base_container.py
@@ -8,6 +8,7 @@ import openllm
 from openllm.cli import termui
 from openllm.cli._factory import container_registry_option, machine_option
 if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
+
 @click.command(
     'build_base_container',
     context_settings=termui.CONTEXT_SETTINGS,
diff --git a/openllm-python/src/openllm/cli/extension/dive_bentos.py b/openllm-python/src/openllm/cli/extension/dive_bentos.py
index 3cf4ea31..8c59f667 100644
--- a/openllm-python/src/openllm/cli/extension/dive_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py
@@ -12,6 +12,7 @@ from bentoml._internal.configuration.containers import BentoMLContainer
 from openllm.cli import termui
 from openllm.cli._factory import bento_complete_envvar, machine_option
 if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore
+
 @click.command('dive_bentos', context_settings=termui.CONTEXT_SETTINGS)
 @click.argument('bento', type=str, shell_complete=bento_complete_envvar)
 @machine_option
diff --git a/openllm-python/src/openllm/cli/extension/get_containerfile.py b/openllm-python/src/openllm/cli/extension/get_containerfile.py
index d6683844..0e0fb5a8 100644
--- a/openllm-python/src/openllm/cli/extension/get_containerfile.py
+++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py
@@ -13,6 +13,7 @@ from openllm.cli import termui
 from openllm.cli._factory import bento_complete_envvar
 from openllm_core.utils import bentoml_cattr
 if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore
+
 @click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
 @click.argument('bento', type=str, shell_complete=bento_complete_envvar)
 @click.pass_context
diff --git a/openllm-python/src/openllm/cli/extension/get_prompt.py b/openllm-python/src/openllm/cli/extension/get_prompt.py
index 995d8ac1..ef8d944c 100644
--- a/openllm-python/src/openllm/cli/extension/get_prompt.py
+++ b/openllm-python/src/openllm/cli/extension/get_prompt.py
@@ -11,6 +11,7 @@ from openllm.cli import termui
 from openllm.cli._factory import machine_option, model_complete_envvar, output_option
 from openllm_core._prompt import process_prompt
 LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
+
 @click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
 @click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
 @click.argument('prompt', type=click.STRING)
diff --git a/openllm-python/src/openllm/cli/extension/list_bentos.py b/openllm-python/src/openllm/cli/extension/list_bentos.py
index fe8c832c..e5320451 100644
--- a/openllm-python/src/openllm/cli/extension/list_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/list_bentos.py
@@ -9,6 +9,7 @@ import openllm
 from bentoml._internal.utils import human_readable_size
 from openllm.cli import termui
 from openllm.cli._factory import LiteralOutput, output_option
+
 @click.command('list_bentos', context_settings=termui.CONTEXT_SETTINGS)
 @output_option(default_value='json')
 @click.pass_context
diff --git a/openllm-python/src/openllm/cli/extension/list_models.py b/openllm-python/src/openllm/cli/extension/list_models.py
index b5ab145a..5ec7814b 100644
--- a/openllm-python/src/openllm/cli/extension/list_models.py
+++ b/openllm-python/src/openllm/cli/extension/list_models.py
@@ -11,6 +11,7 @@ from bentoml._internal.utils import human_readable_size
 from openllm.cli import termui
 from openllm.cli._factory import LiteralOutput, model_complete_envvar, model_name_argument, output_option
 if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny
+
 @click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
 @model_name_argument(required=False, shell_complete=model_complete_envvar)
 @output_option(default_value='json')
diff --git a/openllm-python/src/openllm/cli/extension/playground.py b/openllm-python/src/openllm/cli/extension/playground.py
index afccb5f0..80d74d33 100644
--- a/openllm-python/src/openllm/cli/extension/playground.py
+++ b/openllm-python/src/openllm/cli/extension/playground.py
@@ -20,11 +20,13 @@ if t.TYPE_CHECKING:
 
   from openllm_core._typing_compat import DictStrAny
 logger = logging.getLogger(__name__)
+
 def load_notebook_metadata() -> DictStrAny:
   with open(os.path.join(os.path.dirname(playground.__file__), '_meta.yml'), 'r') as f:
     content = yaml.safe_load(f)
   if not all('description' in k for k in content.values()): raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
   return content
+
 @click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
 @click.argument('output-dir', default=None, required=False)
 @click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
diff --git a/openllm-python/src/openllm/cli/termui.py b/openllm-python/src/openllm/cli/termui.py
index 0ef9891e..90c396b4 100644
--- a/openllm-python/src/openllm/cli/termui.py
+++ b/openllm-python/src/openllm/cli/termui.py
@@ -7,9 +7,11 @@ import inflection
 
 import openllm
 if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny
+
 def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.Any) -> None:
   attrs['fg'] = fg if not openllm.utils.get_debug_mode() else None
   if not openllm.utils.get_quiet_mode(): t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)
+
 COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
 CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
 __all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS']
diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py
index ee428f92..b3339ea0 100644
--- a/openllm-python/src/openllm/client.py
+++ b/openllm-python/src/openllm/client.py
@@ -15,7 +15,9 @@ import typing as t
 
 import openllm_client
 if t.TYPE_CHECKING:  from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient
+
 def __dir__() -> t.Sequence[str]:
   return sorted(dir(openllm_client))
+
 def __getattr__(it: str) -> t.Any:
   return getattr(openllm_client, it)
diff --git a/openllm-python/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py
index e7a45f1e..36d9bd9b 100644
--- a/openllm-python/src/openllm/models/auto/factory.py
+++ b/openllm-python/src/openllm/models/auto/factory.py
@@ -22,6 +22,7 @@ if t.TYPE_CHECKING:
   ConfigModelItemsView = _odict_items[type[openllm.LLMConfig], type[openllm.LLM[t.Any, t.Any]]]
 
 logger = logging.getLogger(__name__)
+
 class BaseAutoLLMClass:
   _model_mapping: t.ClassVar[_LazyAutoMapping]
 
@@ -81,6 +82,7 @@ class BaseAutoLLMClass:
     raise ValueError(
         f"Unrecognized configuration class ({config_class}) for {name}. Model name should be one of {', '.join(openllm.CONFIG_MAPPING.keys())} (Registered configuration class: {', '.join([i.__name__ for i in cls._model_mapping.keys()])})."
     )
+
 def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any:
   if attr is None: return
   if isinstance(attr, tuple): return tuple(getattribute_from_module(module, a) for a in attr)
@@ -93,6 +95,7 @@ def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any:
     except ValueError:
       raise ValueError(f'Could not find {attr} neither in {module} nor in {openllm_module}!') from None
   raise ValueError(f'Could not find {attr} in {openllm_module}!')
+
 class _LazyAutoMapping(OrderedDict, ReprMixin):
   """Based on transformers.models.auto.configuration_auto._LazyAutoMapping.
 
@@ -168,4 +171,5 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
     if hasattr(key, '__name__') and key.__name__ in self._reverse_config_mapping:
       if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM model.")
     self._extra_content[key] = value
+
 __all__ = ['BaseAutoLLMClass', '_LazyAutoMapping']
diff --git a/openllm-python/src/openllm/models/auto/modeling_auto.py b/openllm-python/src/openllm/models/auto/modeling_auto.py
index 7740aba2..b28564d2 100644
--- a/openllm-python/src/openllm/models/auto/modeling_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_auto.py
@@ -9,5 +9,6 @@ MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2
     'opt', 'OPT'
 ), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
+
 class AutoLLM(BaseAutoLLMClass):
   _model_mapping: t.ClassVar = MODEL_MAPPING
diff --git a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py
index 0341aea3..afd236b2 100644
--- a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py
@@ -7,5 +7,6 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
 MODEL_FLAX_MAPPING_NAMES = OrderedDict([('flan_t5', 'FlaxFlanT5'), ('opt', 'FlaxOPT')])
 MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES)
+
 class AutoFlaxLLM(BaseAutoLLMClass):
   _model_mapping: t.ClassVar = MODEL_FLAX_MAPPING
diff --git a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py
index c1b92529..b7cf02c0 100644
--- a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py
@@ -7,5 +7,6 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
 MODEL_TF_MAPPING_NAMES = OrderedDict([('flan_t5', 'TFFlanT5'), ('opt', 'TFOPT')])
 MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
+
 class AutoTFLLM(BaseAutoLLMClass):
   _model_mapping: t.ClassVar = MODEL_TF_MAPPING
diff --git a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
index 37c7310a..2e387898 100644
--- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
@@ -9,5 +9,6 @@ MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2
     'opt', 'VLLMOPT'
 ), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
 MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
+
 class AutoVLLM(BaseAutoLLMClass):
   _model_mapping: t.ClassVar = MODEL_VLLM_MAPPING
diff --git a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
index d349a658..75a52794 100644
--- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
@@ -3,6 +3,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import transformers
+
 class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py
index e9c5a134..44ea3d2e 100644
--- a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py
@@ -3,6 +3,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import vllm, transformers
+
 class VLLMBaichuan(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizerBase']):
   __openllm_internal__ = True
   tokenizer_id = 'local'
diff --git a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
index e76ef17f..52c05ccd 100644
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -3,6 +3,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import transformers
+
 class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerFast']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
index d930c85b..868e6722 100644
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -9,12 +9,15 @@ from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE,
 if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
 else:  torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
 logger = logging.getLogger(__name__)
+
 @overload
 def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline:
   ...
+
 @overload
 def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]:
   ...
+
 def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
   # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
   class InstructionTextGenerationPipeline(transformers.Pipeline):
@@ -115,6 +118,7 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
       return records
 
   return InstructionTextGenerationPipeline() if _init else InstructionTextGenerationPipeline
+
 class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedTokenizer']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py
index f22f79d6..8d34a2d1 100644
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py
@@ -6,6 +6,7 @@ import openllm
 if t.TYPE_CHECKING: import vllm, transformers
 
 logger = logging.getLogger(__name__)
+
 class VLLMDollyV2(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizer']):
   __openllm_internal__ = True
   tokenizer_id = 'local'
diff --git a/openllm-python/src/openllm/models/falcon/modeling_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_falcon.py
index b16cd7cf..d32151de 100644
--- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py
@@ -4,6 +4,7 @@ import typing as t
 import openllm
 if t.TYPE_CHECKING: import torch, transformers
 else: torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers')
+
 class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py
index 61c4aa1d..95c9ac22 100644
--- a/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py
@@ -6,6 +6,7 @@ import openllm
 if t.TYPE_CHECKING: import vllm, transformers
 
 logger = logging.getLogger(__name__)
+
 class VLLMFalcon(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizerBase']):
   __openllm_internal__ = True
   tokenizer_id = 'local'
diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
index fb421edf..bce584c4 100644
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -3,6 +3,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import transformers
+
 class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
index 7a13fd15..40354f0a 100644
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -5,6 +5,7 @@ import openllm
 from openllm_core._prompt import process_prompt
 from openllm_core.config.configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
 if t.TYPE_CHECKING: import transformers
+
 class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
index 6af703fe..eafb9946 100644
--- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
@@ -3,6 +3,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import transformers
+
 class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
index d2661f45..f35b54da 100644
--- a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
+++ b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
@@ -6,6 +6,7 @@ import openllm
 if t.TYPE_CHECKING: import transformers
 
 logger = logging.getLogger(__name__)
+
 class GPTNeoX(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
index 818871fe..f91b3e01 100644
--- a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
+++ b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
@@ -3,6 +3,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import vllm, transformers
+
 class VLLMGPTNeoX(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']):
   __openllm_internal__ = True
   tokenizer_id = 'local'
diff --git a/openllm-python/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py
index 148edf1f..c8eb6632 100644
--- a/openllm-python/src/openllm/models/llama/modeling_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_llama.py
@@ -3,6 +3,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import transformers
+
 class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaTokenizerFast']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py b/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py
index 54c0a875..c02eb0b1 100644
--- a/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py
@@ -3,5 +3,6 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import vllm, transformers
+
 class VLLMLlama(openllm.LLM['vllm.LLMEngine', 'transformers.LlamaTokenizerFast']):
   __openllm_internal__ = True
diff --git a/openllm-python/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
index 394e63ae..d6725ff1 100644
--- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
@@ -8,6 +8,7 @@ from openllm.utils import generate_labels, is_triton_available
 if t.TYPE_CHECKING: import transformers, torch
 
 logger = logging.getLogger(__name__)
+
 def get_mpt_config(
     model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True
 ) -> transformers.PretrainedConfig:
@@ -22,6 +23,7 @@ def get_mpt_config(
   # setting max_seq_len
   config.max_seq_len = max_sequence_length
   return config
+
 class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXTokenizerFast']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py
index f816b343..19334d27 100644
--- a/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py
@@ -3,6 +3,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import transformers, vllm
+
 class VLLMMPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']):
   __openllm_internal__ = True
   tokenizer_id = 'local'
diff --git a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
index d48fe8cf..23c08479 100644
--- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
@@ -11,6 +11,7 @@ if t.TYPE_CHECKING: import transformers
 else: transformers = openllm.utils.LazyLoader('transformers', globals(), 'transformers')
 
 logger = logging.getLogger(__name__)
+
 class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tokenizer']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/opt/modeling_opt.py b/openllm-python/src/openllm/models/opt/modeling_opt.py
index be954ba8..596bc49e 100644
--- a/openllm-python/src/openllm/models/opt/modeling_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_opt.py
@@ -6,6 +6,7 @@ import openllm
 if t.TYPE_CHECKING: import transformers
 
 logger = logging.getLogger(__name__)
+
 class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
index 6c30f6a9..41700ac0 100644
--- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
@@ -5,6 +5,7 @@ import bentoml
 import openllm
 from openllm_core.utils import generate_labels
 if t.TYPE_CHECKING: import transformers
+
 class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tokenizer']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
index 9e87ad60..778ebb6f 100644
--- a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
@@ -5,6 +5,7 @@ import openllm
 from openllm_core._prompt import process_prompt
 from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE
 if t.TYPE_CHECKING: import vllm, transformers
+
 class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']):
   __openllm_internal__ = True
   tokenizer_id = 'local'
diff --git a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
index 01290c2e..0fd4bfe7 100644
--- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
@@ -3,6 +3,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import transformers
+
 class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py
index 1d02d02c..d9f1703e 100644
--- a/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py
@@ -4,6 +4,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import vllm, transformers
+
 class VLLMStableLM(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']):
   __openllm_internal__ = True
   tokenizer_id = 'local'
diff --git a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
index 5812ab96..de251e0d 100644
--- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
@@ -7,6 +7,7 @@ import openllm
 from openllm.utils import generate_labels
 from openllm_core.config.configuration_starcoder import EOD, FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX
 if t.TYPE_CHECKING: import transformers
+
 class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.GPT2TokenizerFast']):
   __openllm_internal__ = True
 
diff --git a/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py
index 20a9e822..87278717 100644
--- a/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py
@@ -4,6 +4,7 @@ import typing as t
 
 import openllm
 if t.TYPE_CHECKING: import vllm, transformers
+
 class VLLMStarCoder(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2TokenizerFast']):
   __openllm_internal__ = True
   tokenizer_id = 'local'
diff --git a/openllm-python/src/openllm/playground/falcon_tuned.py b/openllm-python/src/openllm/playground/falcon_tuned.py
index c5dc9025..17ba7924 100644
--- a/openllm-python/src/openllm/playground/falcon_tuned.py
+++ b/openllm-python/src/openllm/playground/falcon_tuned.py
@@ -24,6 +24,7 @@ from datasets import load_dataset
 from trl import SFTTrainer
 DEFAULT_MODEL_ID = "ybelkada/falcon-7b-sharded-bf16"
 DATASET_NAME = "timdettmers/openassistant-guanaco"
+
 @dataclasses.dataclass
 class TrainingArguments:
   per_device_train_batch_size: int = dataclasses.field(default=4)
@@ -40,10 +41,12 @@ class TrainingArguments:
   group_by_length: bool = dataclasses.field(default=True)
   lr_scheduler_type: str = dataclasses.field(default="constant")
   output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), "outputs", "falcon"))
+
 @dataclasses.dataclass
 class ModelArguments:
   model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
   max_sequence_length: int = dataclasses.field(default=512)
+
 parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
 if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
   # If we pass only one argument to the script and it's the path to a json file,
diff --git a/openllm-python/src/openllm/playground/features.py b/openllm-python/src/openllm/playground/features.py
index 2d31b5a7..c6776759 100644
--- a/openllm-python/src/openllm/playground/features.py
+++ b/openllm-python/src/openllm/playground/features.py
@@ -12,6 +12,7 @@ MAX_NEW_TOKENS = 384
 
 Q = "Answer the following question, step by step:\n{q}\nA:"
 question = "What is the meaning of life?"
+
 def main() -> int:
   parser = argparse.ArgumentParser()
   parser.add_argument("question", default=question)
@@ -42,9 +43,11 @@ def main() -> int:
   logger.info("=" * 10, "Response:", r.llm.postprocess_generate(prompt, res))
 
   return 0
+
 def _mp_fn(index: t.Any):  # noqa # type: ignore
   # For xla_spawn (TPUs)
   main()
+
 if openllm.utils.in_notebook():
   main()
 else:
diff --git a/openllm-python/src/openllm/playground/llama2_qlora.py b/openllm-python/src/openllm/playground/llama2_qlora.py
index b867c174..a1e72fd4 100644
--- a/openllm-python/src/openllm/playground/llama2_qlora.py
+++ b/openllm-python/src/openllm/playground/llama2_qlora.py
@@ -29,6 +29,7 @@ from random import randint, randrange
 
 import bitsandbytes as bnb
 from datasets import load_dataset
+
 # COPIED FROM https://github.com/artidoro/qlora/blob/main/qlora.py
 def find_all_linear_names(model):
   lora_module_names = set()
@@ -40,11 +41,13 @@ def find_all_linear_names(model):
   if "lm_head" in lora_module_names:  # needed for 16-bit
     lora_module_names.remove("lm_head")
   return list(lora_module_names)
+
 # Change this to the local converted path if you don't have access to the meta-llama model
 DEFAULT_MODEL_ID = "meta-llama/Llama-2-7b-hf"
 # change this to 'main' if you want to use the latest llama
 DEFAULT_MODEL_VERSION = "335a02887eb6684d487240bbc28b5699298c3135"
 DATASET_NAME = "databricks/databricks-dolly-15k"
+
 def format_dolly(sample):
   instruction = f"### Instruction\n{sample['instruction']}"
   context = f"### Context\n{sample['context']}" if len(sample["context"]) > 0 else None
@@ -52,12 +55,15 @@ def format_dolly(sample):
   # join all the parts together
   prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None])
   return prompt
+
 # template dataset to add prompt to each sample
 def template_dataset(sample, tokenizer):
   sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}"
   return sample
+
 # empty list to save remainder from batches to use in next batch
 remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}
+
 def chunk(sample, chunk_length=2048):
   # define global remainder variable to save remainder from batches to use in next batch
   global remainder
@@ -78,6 +84,7 @@ def chunk(sample, chunk_length=2048):
   # prepare labels
   result["labels"] = result["input_ids"].copy()
   return result
+
 def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
   # Load dataset from the hub
   dataset = load_dataset(dataset_name, split="train")
@@ -96,6 +103,7 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
   # Print total number of samples
   print(f"Total number of samples: {len(lm_dataset)}")
   return lm_dataset
+
 def prepare_for_int4_training(model_id: str, model_version: str | None = None, gradient_checkpointing: bool = True, bf16: bool = True,
                               ) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
   from peft.tuners.lora import LoraLayer
@@ -130,6 +138,7 @@ def prepare_for_int4_training(model_id: str, model_version: str | None = None, g
         if bf16 and module.weight.dtype == torch.float32:
           module = module.to(torch.bfloat16)
   return model, tokenizer
+
 @dataclasses.dataclass
 class TrainingArguments:
   per_device_train_batch_size: int = dataclasses.field(default=1)
@@ -141,12 +150,14 @@ class TrainingArguments:
   report_to: str = dataclasses.field(default="none")
   output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), "outputs", "llama"))
   save_strategy: str = dataclasses.field(default="no")
+
 @dataclasses.dataclass
 class ModelArguments:
   model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
   model_version: str = dataclasses.field(default=DEFAULT_MODEL_VERSION)
   seed: int = dataclasses.field(default=42)
   merge_weights: bool = dataclasses.field(default=False)
+
 if openllm.utils.in_notebook():
   model_args, training_rags = ModelArguments(), TrainingArguments()
 else:
@@ -160,6 +171,7 @@ else:
 
 # import the model first hand
 openllm.import_model("llama", model_id=model_args.model_id, model_version=model_args.model_version)
+
 def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
   import peft
 
@@ -194,4 +206,5 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
     model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"), safe_serialization=True, max_shard_size="2GB")
   else:
     trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora"))
+
 train_loop(model_args, training_args)
diff --git a/openllm-python/src/openllm/playground/opt_tuned.py b/openllm-python/src/openllm/playground/opt_tuned.py
index 6f04fd05..2043b65e 100644
--- a/openllm-python/src/openllm/playground/opt_tuned.py
+++ b/openllm-python/src/openllm/playground/opt_tuned.py
@@ -24,6 +24,7 @@ from datasets import load_dataset
 if t.TYPE_CHECKING:
   from peft import PeftModel
 DEFAULT_MODEL_ID = "facebook/opt-6.7b"
+
 def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments):
   return transformers.Trainer(
       model=model,
@@ -31,6 +32,7 @@ def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, da
       args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
       data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
   )
+
 @dataclasses.dataclass
 class TrainingArguments:
   per_device_train_batch_size: int = dataclasses.field(default=4)
@@ -41,9 +43,11 @@ class TrainingArguments:
   fp16: bool = dataclasses.field(default=True)
   logging_steps: int = dataclasses.field(default=1)
   output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), "outputs", "opt"))
+
 @dataclasses.dataclass
 class ModelArguments:
   model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
+
 parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
 if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
   # If we pass only one argument to the script and it's the path to a json file,
diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py
index 5c3c677b..e75c5e6b 100644
--- a/openllm-python/src/openllm/serialisation/__init__.py
+++ b/openllm-python/src/openllm/serialisation/__init__.py
@@ -37,6 +37,7 @@ if t.TYPE_CHECKING:
 
   from . import constants as constants, ggml as ggml, transformers as transformers
 P = ParamSpec('P')
+
 def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
   '''Load the tokenizer from BentoML store.
 
@@ -66,10 +67,13 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
     elif tokenizer.eos_token_id is not None: tokenizer.pad_token_id = tokenizer.eos_token_id
     else: tokenizer.add_special_tokens({'pad_token': '[PAD]'})
   return tokenizer
+
 class _Caller(t.Protocol[P]):
   def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
     ...
+
 _extras = ['get', 'import_model', 'save_pretrained', 'load_model']
+
 def _make_dispatch_function(fn: str) -> _Caller[P]:
   def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
     """Generic function dispatch to correct serialisation submodules based on LLM runtime.
@@ -81,6 +85,7 @@ def _make_dispatch_function(fn: str) -> _Caller[P]:
     return getattr(importlib.import_module(f'.{llm.runtime}', __name__), fn)(llm, *args, **kwargs)
 
   return caller
+
 if t.TYPE_CHECKING:
 
   def get(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model:
@@ -94,10 +99,13 @@ if t.TYPE_CHECKING:
 
   def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M:
     ...
+
 _import_structure: dict[str, list[str]] = {'ggml': [], 'transformers': [], 'constants': []}
 __all__ = ['ggml', 'transformers', 'constants', 'load_tokenizer', *_extras]
+
 def __dir__() -> list[str]:
   return sorted(__all__)
+
 def __getattr__(name: str) -> t.Any:
   if name == 'load_tokenizer': return load_tokenizer
   elif name in _import_structure: return importlib.import_module(f'.{name}', __name__)
diff --git a/openllm-python/src/openllm/serialisation/ggml.py b/openllm-python/src/openllm/serialisation/ggml.py
index 5f2244d1..24d961f0 100644
--- a/openllm-python/src/openllm/serialisation/ggml.py
+++ b/openllm-python/src/openllm/serialisation/ggml.py
@@ -10,8 +10,10 @@ import openllm
 if t.TYPE_CHECKING: from openllm_core._typing_compat import M
 
 _conversion_strategy = {'pt': 'ggml'}
+
 def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model:
   raise NotImplementedError('Currently work in progress.')
+
 def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
   '''Return an instance of ``bentoml.Model`` from given LLM instance.
 
@@ -31,7 +33,9 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo
     if auto_import:
       return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
     raise
+
 def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M:
   raise NotImplementedError('Currently work in progress.')
+
 def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs: t.Any) -> None:
   raise NotImplementedError('Currently work in progress.')
diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py
index fc71c5fb..29acfe92 100644
--- a/openllm-python/src/openllm/serialisation/transformers/__init__.py
+++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py
@@ -34,6 +34,7 @@ else:
 logger = logging.getLogger(__name__)
 
 __all__ = ['import_model', 'get', 'load_model', 'save_pretrained']
+
 @inject
 def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, _model_store: ModelStore = Provide[BentoMLContainer.model_store], **attrs: t.Any) -> bentoml.Model:
   """Auto detect model type from given model_id and import it to bentoml's model store.
@@ -136,6 +137,7 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
       # in the case where users first run openllm start without the model available locally.
       if openllm.utils.is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
     return bentomodel
+
 def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
   '''Return an instance of ``bentoml.Model`` from given LLM instance.
 
@@ -157,6 +159,7 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
   except bentoml.exceptions.NotFound as err:
     if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
     raise err from None
+
 def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
   '''Load the model from BentoML store.
 
@@ -189,6 +192,7 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
   if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
   if llm.__llm_implementation__ in {'pt', 'vllm'}: check_unintialised_params(model)
   return t.cast('M', model)
+
 def save_pretrained(
     llm: openllm.LLM[M, T],
     save_directory: str,
diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py
index 0a8c3089..243d837f 100644
--- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py
+++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py
@@ -18,6 +18,7 @@ else:
   transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
 
 _object_setattr = object.__setattr__
+
 def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
   '''A helper function that correctly parse config and attributes for transformers.PretrainedConfig.
 
@@ -37,10 +38,12 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu
     if copied_attrs.get('torch_dtype', None) == 'auto': copied_attrs.pop('torch_dtype')
     config, attrs = transformers.AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs)
   return config, hub_attrs, attrs
+
 def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T:
   __cls = getattr(transformers, openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None)
   if __cls is None: raise ValueError(f'Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`')
   return __cls
+
 def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.PretrainedConfig, /) -> _BaseAutoModelClass:
   if llm.config['trust_remote_code']:
     autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
@@ -55,9 +58,11 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra
     elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: idx = 1
     else: raise openllm.exceptions.OpenLLMException(f'Model type {type(config)} is not supported yet.')
     return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_implementation__][idx])
+
 def check_unintialised_params(model: torch.nn.Module) -> None:
   unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')]
   if len(unintialized) > 0: raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}')
+
 def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model:
   based: DictStrAny = copy.deepcopy(bentomodel.info.metadata)
   based.update(metadata)
@@ -65,6 +70,7 @@ def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Mod
       tag=bentomodel.info.tag, module=bentomodel.info.module, labels=bentomodel.info.labels, options=bentomodel.info.options.to_dict(), signatures=bentomodel.info.signatures, context=bentomodel.info.context, api_version=bentomodel.info.api_version, creation_time=bentomodel.info.creation_time, metadata=based
   ))
   return bentomodel
+
 # NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures
 def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
   infer_fn: tuple[str, ...] = ('__call__',)
diff --git a/openllm-python/src/openllm/serialisation/transformers/weights.py b/openllm-python/src/openllm/serialisation/transformers/weights.py
index 882de1c5..4743d8c0 100644
--- a/openllm-python/src/openllm/serialisation/transformers/weights.py
+++ b/openllm-python/src/openllm/serialisation/transformers/weights.py
@@ -6,8 +6,10 @@ from huggingface_hub import HfApi
 if t.TYPE_CHECKING:
   import openllm
   from openllm_core._typing_compat import M, T
+
 def has_safetensors_weights(model_id: str, revision: str | None = None) -> bool:
   return any(s.rfilename.endswith('.safetensors') for s in HfApi().model_info(model_id, revision=revision).siblings)
+
 @attr.define(slots=True)
 class HfIgnore:
   safetensors = '*.safetensors'
diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py
index 1ff88a86..fc973e3c 100644
--- a/openllm-python/src/openllm/testing.py
+++ b/openllm-python/src/openllm/testing.py
@@ -11,6 +11,7 @@ import openllm
 if t.TYPE_CHECKING: from ._typing_compat import LiteralRuntime
 
 logger = logging.getLogger(__name__)
+
 @contextlib.contextmanager
 def build_bento(
     model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, runtime: t.Literal['ggml', 'transformers'] = 'transformers', cleanup: bool = False
@@ -21,6 +22,7 @@ def build_bento(
   if cleanup:
     logger.info('Deleting %s', bento.tag)
     bentoml.bentos.delete(bento.tag)
+
 @contextlib.contextmanager
 def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any) -> t.Iterator[str]:
   if isinstance(bento, bentoml.Bento): bento_tag = bento.tag
@@ -36,6 +38,7 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | N
     if cleanup:
       logger.info('Deleting container %s', image_tag)
       subprocess.check_output([executable, 'rmi', '-f', image_tag])
+
 @contextlib.contextmanager
 def prepare(
     model: str,
diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py
index 75eccf9c..637f3e98 100644
--- a/openllm-python/src/openllm/utils/__init__.py
+++ b/openllm-python/src/openllm/utils/__init__.py
@@ -8,17 +8,14 @@ import typing as t
 
 import openllm_core
 
-from . import (
-  dummy_flax_objects as dummy_flax_objects,
-  dummy_pt_objects as dummy_pt_objects,
-  dummy_tf_objects as dummy_tf_objects,
-  dummy_vllm_objects as dummy_vllm_objects,
-)
+from . import dummy_flax_objects as dummy_flax_objects, dummy_pt_objects as dummy_pt_objects, dummy_tf_objects as dummy_tf_objects, dummy_vllm_objects as dummy_vllm_objects
 if t.TYPE_CHECKING:
   import openllm
   from openllm_core._typing_compat import LiteralRuntime
+
 def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
   return {'runtime': llm.runtime, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation_format': llm._serialisation_format}
+
 def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
   import openllm
   if implementation == 'tf': return openllm.AutoTFLLM
@@ -26,9 +23,12 @@ def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | o
   elif implementation == 'pt': return openllm.AutoLLM
   elif implementation == 'vllm': return openllm.AutoVLLM
   else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')")
+
 __all__ = ['generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects', 'dummy_vllm_objects']
+
 def __dir__() -> t.Sequence[str]:
   return sorted(__all__)
+
 def __getattr__(it: str) -> t.Any:
   if hasattr(openllm_core.utils, it): return getattr(openllm_core.utils, it)
   else: raise AttributeError(f'module {__name__} has no attribute {it}')
diff --git a/openllm-python/tests/_strategies/_configuration.py b/openllm-python/tests/_strategies/_configuration.py
index c2ea2e4d..28c27dac 100644
--- a/openllm-python/tests/_strategies/_configuration.py
+++ b/openllm-python/tests/_strategies/_configuration.py
@@ -9,6 +9,7 @@ from openllm_core._configuration import ModelSettings
 logger = logging.getLogger(__name__)
 
 env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_name in openllm.CONFIG_MAPPING.keys()])
+
 @st.composite
 def model_settings(draw: st.DrawFn):
   '''Strategy for generating ModelSettings objects.'''
@@ -28,6 +29,7 @@ def model_settings(draw: st.DrawFn):
       'workers_per_resource': st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
   }
   return draw(st.builds(ModelSettings, **kwargs))
+
 def make_llm_config(
     cls_name: str,
     dunder_config: dict[str, t.Any] | ModelSettings,
diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py
index a0f78fdf..c6080adb 100644
--- a/openllm-python/tests/configuration_test.py
+++ b/openllm-python/tests/configuration_test.py
@@ -14,6 +14,7 @@ import openllm
 from openllm_core._configuration import GenerationConfig, ModelSettings, field_env_key
 
 from ._strategies._configuration import make_llm_config, model_settings
+
 # XXX: @aarnphm fixes TypedDict behaviour in 3.11
 @pytest.mark.skipif(sys.version_info[:2] == (3, 11), reason='TypedDict in 3.11 behaves differently, so we need to fix this')
 def test_missing_default():
@@ -23,6 +24,7 @@ def test_missing_default():
     make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']})
   with pytest.raises(ValueError, match='Missing required fields *'):
     make_llm_config('MissingArchitecture', {'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing'], 'requirements': ['bentoml'],},)
+
 def test_forbidden_access():
   cl_ = make_llm_config(
       'ForbiddenAccess', {
@@ -34,6 +36,7 @@ def test_forbidden_access():
   assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), 'GenerationConfig',)
   assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), 'SamplingParams',)
   assert openllm.utils.lenient_issubclass(cl_.__openllm_generation_class__, GenerationConfig)
+
 @given(model_settings())
 def test_class_normal_gen(gen_settings: ModelSettings):
   assume(gen_settings['default_id'] and all(i for i in gen_settings['model_ids']))
@@ -41,19 +44,23 @@ def test_class_normal_gen(gen_settings: ModelSettings):
   assert issubclass(cl_, openllm.LLMConfig)
   for key in gen_settings:
     assert object.__getattribute__(cl_, f'__openllm_{key}__') == gen_settings.__getitem__(key)
+
 @given(model_settings(), st.integers())
 def test_simple_struct_dump(gen_settings: ModelSettings, field1: int):
   cl_ = make_llm_config('IdempotentLLM', gen_settings, fields=(('field1', 'float', field1),))
   assert cl_().model_dump()['field1'] == field1
+
 @given(model_settings(), st.integers())
 def test_config_derivation(gen_settings: ModelSettings, field1: int):
   cl_ = make_llm_config('IdempotentLLM', gen_settings, fields=(('field1', 'float', field1),))
   new_cls = cl_.model_derivate('DerivedLLM', default_id='asdfasdf')
   assert new_cls.__openllm_default_id__ == 'asdfasdf'
+
 @given(model_settings())
 def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings):
   cl_ = make_llm_config('AttrsProtocolLLM', gen_settings)
   assert attr.has(cl_)
+
 @given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),)
 def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float):
   cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),)
@@ -72,10 +79,12 @@ def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperatu
   pas_nested = cl_(generation_config={'temperature': input_temperature}, field1=input_field1)
   assert pas_nested.model_dump()['field1'] == input_field1
   assert pas_nested.model_dump()['generation_config']['temperature'] == input_temperature
+
 @contextlib.contextmanager
 def patch_env(**attrs: t.Any):
   with mock.patch.dict(os.environ, attrs, clear=True):
     yield
+
 def test_struct_envvar():
   with patch_env(**{field_env_key('env_llm', 'field1'): '4', field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',}):
 
@@ -93,6 +102,7 @@ def test_struct_envvar():
     overwrite_default = EnvLLM()
     assert overwrite_default.field1 == 4
     assert overwrite_default['temperature'] == 0.2
+
 def test_struct_provided_fields():
   class EnvLLM(openllm.LLMConfig):
     __config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
@@ -104,6 +114,7 @@ def test_struct_provided_fields():
   sent = EnvLLM.model_construct_env(field1=20, temperature=0.4)
   assert sent.field1 == 20
   assert sent.generation_config.temperature == 0.4
+
 def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPatch):
   with monkeypatch.context() as mk:
     mk.setenv(field_env_key('overwrite_with_env_available', 'field1'), str(4.0))
@@ -115,11 +126,13 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat
     ).model_construct_env(field1=20.0, temperature=0.4)
     assert sent.generation_config.temperature == 0.4
     assert sent.field1 == 20.0
+
 @given(model_settings())
 @pytest.mark.parametrize(('return_dict', 'typ'), [(True, dict), (False, transformers.GenerationConfig)])
 def test_conversion_to_transformers(return_dict: bool, typ: type[t.Any], gen_settings: ModelSettings):
   cl_ = make_llm_config('ConversionLLM', gen_settings)
   assert isinstance(cl_().to_generation_config(return_as_dict=return_dict), typ)
+
 @given(model_settings())
 def test_click_conversion(gen_settings: ModelSettings):
   # currently our conversion omit Union type.
@@ -131,6 +144,7 @@ def test_click_conversion(gen_settings: ModelSettings):
   filtered = {k for k, v in cl_.__openllm_hints__.items() if t.get_origin(v) is not t.Union}
   click_options_filtered = [i for i in wrapped.__click_params__ if i.name and not i.name.startswith('fake_')]
   assert len(filtered) == len(click_options_filtered)
+
 @pytest.mark.parametrize('model_name', openllm.CONFIG_MAPPING.keys())
 def test_configuration_dict_protocol(model_name: str):
   config = openllm.AutoConfig.for_model(model_name)
diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py
index be9b812f..02655e11 100644
--- a/openllm-python/tests/conftest.py
+++ b/openllm-python/tests/conftest.py
@@ -10,6 +10,7 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralRuntime
 
 _FRAMEWORK_MAPPING = {'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B',}
 _PROMPT_MAPPING = {'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',}
+
 def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
   if model not in _FRAMEWORK_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
   runtime_impl: tuple[LiteralRuntime, ...] = tuple()
@@ -19,10 +20,12 @@ def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunn
   for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()):
     llm = openllm.Runner(model, model_id=_FRAMEWORK_MAPPING[model], ensure_available=True, implementation=framework, init_local=True,)
     yield prompt, llm
+
 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
   if os.getenv('GITHUB_ACTIONS') is None:
     if 'prompt' in metafunc.fixturenames and 'llm' in metafunc.fixturenames:
       metafunc.parametrize('prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])
+
 def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
   # If no tests are collected, pytest exists with code 5, which makes the CI fail.
   if exitstatus == 5: session.exitstatus = 0
diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py
index 47e1a40e..e63a255e 100644
--- a/openllm-python/tests/models/conftest.py
+++ b/openllm-python/tests/models/conftest.py
@@ -29,6 +29,7 @@ if t.TYPE_CHECKING:
 
   from openllm._configuration import GenerationConfig
   from openllm.client import BaseAsyncClient
+
 class ResponseComparator(JSONSnapshotExtension):
   def serialize(self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None,) -> SerializedData:
     if openllm.utils.LazyType(ListAny).isinstance(data):
@@ -66,9 +67,11 @@ class ResponseComparator(JSONSnapshotExtension):
       return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and eq_config(s.marshaled_config, t.marshaled_config))
 
     return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])
+
 @pytest.fixture()
 def response_snapshot(snapshot: SnapshotAssertion):
   return snapshot.use_extension(ResponseComparator)
+
 @attr.define(init=False)
 class _Handle(ABC):
   port: int
@@ -100,6 +103,7 @@ class _Handle(ABC):
       except Exception:
         time.sleep(1)
     raise RuntimeError(f'Handle failed to initialise within {timeout} seconds.')
+
 @attr.define(init=False)
 class LocalHandle(_Handle):
   process: subprocess.Popen[bytes]
@@ -109,10 +113,12 @@ class LocalHandle(_Handle):
 
   def status(self) -> bool:
     return self.process.poll() is None
+
 class HandleProtocol(t.Protocol):
   @contextlib.contextmanager
   def __call__(*, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None,) -> t.Generator[_Handle, None, None]:
     ...
+
 @attr.define(init=False)
 class DockerHandle(_Handle):
   container_name: str
@@ -124,6 +130,7 @@ class DockerHandle(_Handle):
   def status(self) -> bool:
     container = self.docker_client.containers.get(self.container_name)
     return container.status in ['running', 'created']
+
 @contextlib.contextmanager
 def _local_handle(
     model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False,
@@ -146,6 +153,7 @@ def _local_handle(
   proc.stdout.close()
   if proc.stderr:
     proc.stderr.close()
+
 @contextlib.contextmanager
 def _container_handle(
     model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False,
@@ -192,19 +200,23 @@ def _container_handle(
   print(container_output, file=sys.stderr)
 
   container.remove()
+
 @pytest.fixture(scope='session', autouse=True)
 def clean_context() -> t.Generator[contextlib.ExitStack, None, None]:
   stack = contextlib.ExitStack()
   yield stack
   stack.close()
+
 @pytest.fixture(scope='module')
 def el() -> t.Generator[asyncio.AbstractEventLoop, None, None]:
   loop = asyncio.get_event_loop()
   yield loop
   loop.close()
+
 @pytest.fixture(params=['container', 'local'], scope='session')
 def deployment_mode(request: pytest.FixtureRequest) -> str:
   return request.param
+
 @pytest.fixture(scope='module')
 def handler(el: asyncio.AbstractEventLoop, deployment_mode: t.Literal['container', 'local']):
   if deployment_mode == 'container':
diff --git a/openllm-python/tests/models/flan_t5_test.py b/openllm-python/tests/models/flan_t5_test.py
index fd3c6d22..85fe83f8 100644
--- a/openllm-python/tests/models/flan_t5_test.py
+++ b/openllm-python/tests/models/flan_t5_test.py
@@ -10,15 +10,18 @@ if t.TYPE_CHECKING:
   from .conftest import HandleProtocol, ResponseComparator, _Handle
 model = 'flan_t5'
 model_id = 'google/flan-t5-small'
+
 @pytest.fixture(scope='module')
 def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
   with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
     with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
       yield handle
+
 @pytest.fixture(scope='module')
 async def flan_t5(flan_t5_handle: _Handle):
   await flan_t5_handle.health(240)
   return flan_t5_handle.client
+
 @pytest.mark.asyncio()
 async def test_flan_t5(flan_t5: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator):
   client = await flan_t5
diff --git a/openllm-python/tests/models/opt_test.py b/openllm-python/tests/models/opt_test.py
index 3be257b4..b6db0798 100644
--- a/openllm-python/tests/models/opt_test.py
+++ b/openllm-python/tests/models/opt_test.py
@@ -10,15 +10,18 @@ if t.TYPE_CHECKING:
   from .conftest import HandleProtocol, ResponseComparator, _Handle
 model = 'opt'
 model_id = 'facebook/opt-125m'
+
 @pytest.fixture(scope='module')
 def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
   with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
     with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
       yield handle
+
 @pytest.fixture(scope='module')
 async def opt_125m(opt_125m_handle: _Handle):
   await opt_125m_handle.health(240)
   return opt_125m_handle.client
+
 @pytest.mark.asyncio()
 async def test_opt_125m(opt_125m: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator):
   client = await opt_125m
diff --git a/openllm-python/tests/models_test.py b/openllm-python/tests/models_test.py
index 7ffd56e4..9d7b7800 100644
--- a/openllm-python/tests/models_test.py
+++ b/openllm-python/tests/models_test.py
@@ -4,16 +4,19 @@ import typing as t
 
 import pytest
 if t.TYPE_CHECKING: import openllm
+
 @pytest.mark.skipif(os.getenv('GITHUB_ACTIONS') is not None, reason='Model is too large for CI')
 def test_flan_t5_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]):
   assert llm(prompt)
 
   assert llm(prompt, temperature=0.8, top_p=0.23)
+
 @pytest.mark.skipif(os.getenv('GITHUB_ACTIONS') is not None, reason='Model is too large for CI')
 def test_opt_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]):
   assert llm(prompt)
 
   assert llm(prompt, temperature=0.9, top_k=8)
+
 @pytest.mark.skipif(os.getenv('GITHUB_ACTIONS') is not None, reason='Model is too large for CI')
 def test_baichuan_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]):
   assert llm(prompt)
diff --git a/openllm-python/tests/package_test.py b/openllm-python/tests/package_test.py
index 4f16dd4b..291fa3b2 100644
--- a/openllm-python/tests/package_test.py
+++ b/openllm-python/tests/package_test.py
@@ -14,6 +14,7 @@ HF_INTERNAL_T5_TESTING = 'hf-internal-testing/tiny-random-t5'
 actions_xfail = functools.partial(
     pytest.mark.xfail, condition=os.getenv('GITHUB_ACTIONS') is not None, reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.',
 )
+
 @actions_xfail
 def test_general_build_with_internal_testing():
   bento_store = BentoMLContainer.bento_store.get()
@@ -26,6 +27,7 @@ def test_general_build_with_internal_testing():
 
   bento = openllm.build('flan-t5', model_id=HF_INTERNAL_T5_TESTING)
   assert len(bento_store.list(bento.tag)) == 1
+
 @actions_xfail
 def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
   local_path = tmp_path_factory.mktemp('local_t5')
@@ -37,11 +39,13 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
   llm.save_pretrained(local_path)
 
   assert openllm.build('flan-t5', model_id=local_path.resolve().__fspath__(), model_version='local')
+
 @pytest.fixture()
 def dockerfile_template(tmp_path_factory: pytest.TempPathFactory):
   file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template'
   file.write_text("{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}")
   return file
+
 @pytest.mark.usefixtures('dockerfile_template')
 @actions_xfail
 def test_build_with_custom_dockerfile(dockerfile_template: Path):
diff --git a/openllm-python/tests/strategies_test.py b/openllm-python/tests/strategies_test.py
index da2d34c7..b0d40761 100644
--- a/openllm-python/tests/strategies_test.py
+++ b/openllm-python/tests/strategies_test.py
@@ -8,6 +8,7 @@ import bentoml
 from openllm_core import _strategies as strategy
 from openllm_core._strategies import CascadingResourceStrategy, NvidiaGpuResource, get_resource
 if t.TYPE_CHECKING: from _pytest.monkeypatch import MonkeyPatch
+
 def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch):
   with monkeypatch.context() as mcls:
     mcls.setenv('CUDA_VISIBLE_DEVICES', '0,1')
@@ -15,6 +16,7 @@ def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch):
     assert len(resource) == 2
     assert resource == ['0', '1']
     mcls.delenv('CUDA_VISIBLE_DEVICES')
+
 def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch):
   with monkeypatch.context() as mcls:
     mcls.setenv('CUDA_VISIBLE_DEVICES', '0,2,-1,1')
@@ -22,6 +24,7 @@ def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch):
     assert len(resource) == 2
     assert resource == ['0', '2']
     mcls.delenv('CUDA_VISIBLE_DEVICES')
+
 def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch):
   with monkeypatch.context() as mcls:
     mcls.setenv('CUDA_VISIBLE_DEVICES', '-1')
@@ -29,6 +32,7 @@ def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch):
     assert len(resource) == 0
     assert resource == []
     mcls.delenv('CUDA_VISIBLE_DEVICES')
+
 def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch):
   with monkeypatch.context() as mcls:
     mcls.setenv('CUDA_VISIBLE_DEVICES', 'GPU-5ebe9f43-ac33420d4628')
@@ -54,6 +58,7 @@ def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch):
     assert len(resource) == 1
     assert resource == ['MIG-GPU-5ebe9f43-ac33420d4628']
     mcls.delenv('CUDA_VISIBLE_DEVICES')
+
 @pytest.mark.skipif(os.getenv('GITHUB_ACTIONS') is not None, reason='skip GPUs test on CI')
 def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
   with monkeypatch.context() as mcls:
@@ -64,6 +69,7 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
     assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],).match('Input list should be all string type.')
     assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match('Input list should be all string type.')
     assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')
+
 def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
   with monkeypatch.context() as mcls:
     # to make this tests works with system that has GPU
@@ -90,10 +96,13 @@ def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
     NvidiaGpuResource.from_spec(1.5)
   with pytest.raises(ValueError):
     assert NvidiaGpuResource.from_spec(-2)
+
 class GPURunnable(bentoml.Runnable):
   SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu')
+
 def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False):
   return get_resource(x, y, validate=validate)
+
 @pytest.mark.parametrize('gpu_type', ['nvidia.com/gpu', 'amd.com/gpu'])
 def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str):
   monkeypatch.setattr(strategy, 'get_resource', unvalidated_get_resource)
@@ -104,6 +113,7 @@ def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str):
   assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 1
   assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 1
   assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 1
+
 @pytest.mark.parametrize('gpu_type', ['nvidia.com/gpu', 'amd.com/gpu'])
 def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str):
   monkeypatch.setattr(strategy, 'get_resource', unvalidated_get_resource)
@@ -142,6 +152,7 @@ def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str):
   assert envs.get('CUDA_VISIBLE_DEVICES') == '7,8'
   envs = CascadingResourceStrategy.get_worker_env(GPURunnable, {gpu_type: [2, 6, 7, 8, 9]}, 0.4, 2)
   assert envs.get('CUDA_VISIBLE_DEVICES') == '9'
+
 @pytest.mark.parametrize('gpu_type', ['nvidia.com/gpu', 'amd.com/gpu'])
 def test_cascade_strategy_disabled_via_env(monkeypatch: MonkeyPatch, gpu_type: str):
   monkeypatch.setattr(strategy, 'get_resource', unvalidated_get_resource)
diff --git a/pyproject.toml b/pyproject.toml
index 94940909..d7f2d881 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -265,14 +265,14 @@ ALLOW_MULTILINE_LAMBDAS = false
 ALLOW_SPLIT_BEFORE_DEFAULT_OR_NAMED_ASSIGNS = false
 ALLOW_SPLIT_BEFORE_DICT_VALUE = false
 ARITHMETIC_PRECEDENCE_INDICATION = true
-BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 0
+BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1
 BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 0
 BLANK_LINE_BEFORE_CLASS_DOCSTRING = false
 BLANK_LINE_BEFORE_MODULE_DOCSTRING = false
 BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
 COALESCE_BRACKETS = true
 COLUMN_LIMIT = 192
-CONTINUATION_ALIGN_STYLE = "VALIGN-RIGHT"
+CONTINUATION_ALIGN_STYLE = "SPACE"
 DEDENT_CLOSING_BRACKETS = true
 DISABLE_ENDING_COMMA_HEURISTIC = true
 EACH_DICT_ENTRY_ON_SEPARATE_LINE = true
diff --git a/tools/dependencies.py b/tools/dependencies.py
index af0470b9..3e895f39 100755
--- a/tools/dependencies.py
+++ b/tools/dependencies.py
@@ -9,6 +9,7 @@ sys.path.insert(0, os.path.join(ROOT, 'openllm-python', 'src'))
 
 import openllm
 _OWNER, _REPO = 'bentoml', 'openllm'
+
 @dataclasses.dataclass(frozen=True)
 class Classifier:
   identifier: t.Dict[str, str] = dataclasses.field(
@@ -53,6 +54,7 @@ class Classifier:
   @staticmethod
   def create_status_classifier(level: int) -> str:
     return Classifier.create_classifier('status', Classifier.status()[level])
+
 @dataclasses.dataclass(frozen=True)
 class Dependencies:
   name: str
@@ -95,6 +97,7 @@ class Dependencies:
   @classmethod
   def from_tuple(cls, *decls: t.Any) -> Dependencies:
     return cls(*decls)
+
 lower_bentoml_constraint = '1.1.2'
 _BENTOML_EXT = ['io']
 _TRANSFORMERS_EXT = ['torch', 'tokenizers', 'accelerate']
@@ -138,8 +141,10 @@ _base_requirements.update({v: _locals.get(f'{inflection.underscore(v).upper()}_D
 _base_requirements = {k: v for k, v in sorted(_base_requirements.items())}
 
 fname = f'{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}'
+
 def correct_style(it: t.Any) -> t.Any:
   return it
+
 def create_classifiers() -> Array:
   arr = correct_style(tomlkit.array())
   arr.extend([
@@ -159,6 +164,7 @@ def create_classifiers() -> Array:
       *Classifier.create_python_classifier(),
   ])
   return arr.multiline(True)
+
 def create_optional_table() -> Table:
   all_array = tomlkit.array()
   all_array.append(f"openllm[{','.join(_base_requirements)}]")
@@ -169,6 +175,7 @@ def create_optional_table() -> Table:
   table.add(tomlkit.nl())
 
   return table
+
 def create_url_table(_info: t.Any) -> Table:
   table = tomlkit.table()
   _urls = {
@@ -183,6 +190,7 @@ def create_url_table(_info: t.Any) -> Table:
   }
   table.update({k: v for k, v in sorted(_urls.items())})
   return table
+
 def build_system() -> Table:
   table = tomlkit.table()
   table.add('build-backend', 'hatchling.build')
@@ -190,11 +198,13 @@ def build_system() -> Table:
   requires_array.extend(['hatchling==1.18.0', 'hatch-vcs==0.3.0', 'hatch-fancy-pypi-readme==23.1.0'])
   table.add('requires', requires_array.multiline(True))
   return table
+
 def authors() -> Array:
   arr = correct_style(tomlkit.array())
   arr.append(dict(name='Aaron Pham', email='aarnphm@bentoml.com'))
   arr.append(dict(name='BentoML Team', email='contact@bentoml.com'))
   return arr.multiline(True)
+
 def keywords() -> Array:
   arr = correct_style(tomlkit.array())
   arr.extend([
@@ -217,6 +227,7 @@ def keywords() -> Array:
       'Transformers'
   ])
   return arr.multiline(True)
+
 def build_cli_extensions() -> Table:
   table = tomlkit.table()
   ext: dict[str, str] = {'openllm': 'openllm.cli.entrypoint:cli'}
@@ -228,6 +239,7 @@ def build_cli_extensions() -> Table:
   })
   table.update(ext)
   return table
+
 def main() -> int:
   api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
   _info = api.repos.get()
@@ -258,4 +270,5 @@ def main() -> int:
   with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'w') as f:
     f.write(tomlkit.dumps(pyproject))
   return 0
+
 if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/generate-coverage.py b/tools/generate-coverage.py
index d3e845c4..4bf3d9ed 100755
--- a/tools/generate-coverage.py
+++ b/tools/generate-coverage.py
@@ -8,6 +8,7 @@ from lxml import etree
 ROOT = Path(__file__).resolve().parent.parent
 
 PACKAGES = {'openllm-python/src/openllm/': 'openllm'}
+
 def main() -> int:
   coverage_report = ROOT / 'coverage.xml'
   root = etree.fromstring(coverage_report.read_text())
@@ -42,4 +43,5 @@ def main() -> int:
   coverage_summary = ROOT / 'coverage-summary.json'
   coverage_summary.write_text(orjson.dumps(coverage_data, option=orjson.OPT_INDENT_2).decode(), encoding='utf-8')
   return 0
+
 if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-brew-tap.py b/tools/update-brew-tap.py
index fc94a3a1..2e5c9756 100755
--- a/tools/update-brew-tap.py
+++ b/tools/update-brew-tap.py
@@ -18,12 +18,15 @@ _REPO = 'openllm'
 _gz_strategies: dict[t.Literal['macos_arm', 'macos_intel', 'linux_intel'], str] = {
     'macos_arm': 'aarch64-apple-darwin', 'macos_intel': 'x86_64-apple-darwin', 'linux_intel': 'x86_64-unknown-linux-musl'
 }
+
 def determine_release_url(svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel', 'archive']) -> str:
   if target == 'archive': return f'{svn_url}/archive/{tag}.tar.gz'
   return f"{svn_url}/releases/download/{tag}/openllm-{tag.replace('v', '')}-{_gz_strategies[target]}.tar.gz"
+
 # curl -sSL <svn_url>/archive/refs/tags/<tag>.tar.gz | shasum -a256 | cut -d'' -f1
 def get_release_hash_command(svn_url: str, tag: str) -> Pipeline:
   return curl['-sSL', svn_url] | shasum['-a256'] | cut['-d', ' ', '-f1']
+
 def main() -> int:
   api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
   _info = api.repos.get()
@@ -54,4 +57,5 @@ def main() -> int:
     )
     f.write('\n')
   return 0
+
 if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py
index bc849797..5f57797c 100755
--- a/tools/update-config-stubs.py
+++ b/tools/update-config-stubs.py
@@ -17,10 +17,12 @@ _TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.
 sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__())
 from openllm_core._configuration import GenerationConfig, ModelSettings, PeftType, SamplingParams
 from openllm_core.utils import codegen
+
 def process_annotations(annotations: str) -> str:
   if 'NotRequired' in annotations: return annotations[len('NotRequired['):-1]
   elif 'Required' in annotations: return annotations[len('Required['):-1]
   else: return annotations
+
 _value_docstring = {
     'default_id': '''Return the default model to use when using 'openllm start <model_id>'.
         This could be one of the keys in 'self.model_ids' or custom users model.
@@ -81,6 +83,7 @@ _value_docstring = {
 }
 
 _transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'}
+
 def main() -> int:
   with _TARGET_FILE.open('r') as f:
     processed = f.readlines()
@@ -135,4 +138,5 @@ def main() -> int:
   with _TARGET_FILE.open('w') as f:
     f.writelines(processed)
   return 0
+
 if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-dummy.py b/tools/update-dummy.py
index b610812e..20ddf0af 100755
--- a/tools/update-dummy.py
+++ b/tools/update-dummy.py
@@ -15,12 +15,16 @@ if t.TYPE_CHECKING: from collections import OrderedDict
 config_requirements = {k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items()}
 _dependencies: dict[LiteralRuntime, str] = {k: v for k, v in zip(LiteralRuntime.__args__, ('torch', 'tensorflow', 'flax', 'vllm'))}
 _auto: dict[str, str] = {k: v for k, v in zip(LiteralRuntime.__args__, ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))}
+
 def get_target_dummy_file(framework: LiteralRuntime) -> Path:
   return _ROOT / 'openllm-python' / 'src' / 'openllm' / 'utils' / f'dummy_{framework}_objects.py'
+
 def mapping_names(framework: LiteralRuntime):
   return 'MODEL_MAPPING_NAMES' if framework == 'pt' else f'MODEL_{framework.upper()}_MAPPING_NAMES'
+
 def get_mapping(framework: LiteralRuntime) -> OrderedDict[t.Any, t.Any]:
   return getattr(auto, mapping_names(framework))
+
 def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int = 2, auto: bool = False) -> list[str]:
   _dep_list: list[str] = [
       f'"{v}"' for v in [_dependencies[framework], *(t.cast(t.List[str], config_requirements[model_name]) if model_name != '__default__' and config_requirements[model_name] else [])]
@@ -33,6 +37,7 @@ def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int
       ' '*indentation + f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])"
   ]
   return lines
+
 def write_stub(framework: LiteralRuntime, _path: str) -> list[str]:
   base = [
       f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!',
@@ -48,10 +53,12 @@ def write_stub(framework: LiteralRuntime, _path: str) -> list[str]:
   _imports = [f'"{v}"' for v in get_mapping(framework).values()]
   base += [f'{mapping_names(framework)}:_t.Any=None', f"__all__:list[str]=[\"{mapping_names(framework)}\",\"{_auto[framework]}\",{','.join(_imports)}]\n"]
   return base
+
 def main() -> int:
   _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__))
   for framework in _dependencies:
     with get_target_dummy_file(framework).open('w') as f:
       f.write('\n'.join(write_stub(framework, _path)))
   return 0
+
 if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-models-import.py b/tools/update-models-import.py
index 098e3147..97f8b2b7 100755
--- a/tools/update-models-import.py
+++ b/tools/update-models-import.py
@@ -3,9 +3,11 @@ from __future__ import annotations
 import os
 from pathlib import Path
 _TARGET_FILE = Path(__file__).parent.parent / 'openllm-python' / 'src' / 'openllm' / 'models' / '__init__.py'
+
 def create_module_import() -> str:
   r = [f'"{p.name}"' for p in _TARGET_FILE.parent.glob('*/') if p.name not in ['__pycache__', '__init__.py', '.DS_Store']]
   return f"_MODELS:set[str]={{{', '.join(sorted(r))}}}"
+
 def create_stubs_import() -> list[str]:
   return [
       'if t.TYPE_CHECKING:from . import ' + ','.join([f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}]),
@@ -14,6 +16,7 @@ def create_stubs_import() -> list[str]:
       '__dir__=__lazy.__dir__',
       '__getattr__=__lazy.__getattr__\n'
   ]
+
 def main() -> int:
   _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__))
   with _TARGET_FILE.open('w') as f:
@@ -29,4 +32,5 @@ def main() -> int:
         ])
     )
   return 0
+
 if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-readme.py b/tools/update-readme.py
index 398002f7..38d92278 100755
--- a/tools/update-readme.py
+++ b/tools/update-readme.py
@@ -8,6 +8,7 @@ END_COMMENT = f'<!-- {os.path.basename(__file__)}: stop -->\n'
 ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, os.path.join(ROOT, 'openllm-python', 'src'))
 import openllm
+
 def main() -> int:
   with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'r') as f:
     deps = tomlkit.parse(f.read()).value['project']['optional-dependencies']
@@ -56,4 +57,5 @@ def main() -> int:
   with open(os.path.join(ROOT, 'README.md'), 'w') as f:
     f.writelines(readme)
   return 0
+
 if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/write-coverage-report.py b/tools/write-coverage-report.py
index 53edac46..f24cd9c1 100755
--- a/tools/write-coverage-report.py
+++ b/tools/write-coverage-report.py
@@ -6,6 +6,7 @@ import orjson
 PRECISION = Decimal('.01')
 
 ROOT = Path(__file__).resolve().parent.parent
+
 def main() -> int:
   coverage_summary = ROOT / 'coverage-summary.json'
 
@@ -35,4 +36,5 @@ def main() -> int:
   with coverage_report.open('w', encoding='utf-8') as f:
     f.write(''.join(lines))
   return 0
+
 if __name__ == '__main__': raise SystemExit(main())