diff --git a/changelog.d/249.refactor.md b/changelog.d/249.refactor.md
index d8bfd324..209d9115 100644
--- a/changelog.d/249.refactor.md
+++ b/changelog.d/249.refactor.md
@@ -3,3 +3,6 @@ OpenLLM now comprise of three packages:
- `openllm-core`: main building blocks of OpenLLM, that doesn't depend on transformers and heavy DL libraries
- `openllm-client`: The implementation of `openllm.client`
- `openllm`: = `openllm-core` + `openllm-client` + DL features (under `openllm-python`)
+
+OpenLLM now will provide `start-grpc` as opt-in. If you want to use `openllm start-grpc`, make sure to install
+with `pip install "openllm[grpc]"`
diff --git a/hatch.toml b/hatch.toml
index 875a7140..32ffac03 100644
--- a/hatch.toml
+++ b/hatch.toml
@@ -32,7 +32,6 @@ inplace-changelog = "towncrier build --version main --keep"
quality = [
"./tools/dependencies.py",
"./tools/update-readme.py",
- "- ./tools/yapf",
"- ./tools/update-brew-tap.py",
"bash ./tools/sync-readme.sh",
"check-stubs",
diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py
index 18b0e2a8..73b72f83 100644
--- a/openllm-core/src/openllm_core/utils/import_utils.py
+++ b/openllm-core/src/openllm_core/utils/import_utils.py
@@ -5,14 +5,13 @@ from collections import OrderedDict
import inflection, packaging.version
from bentoml._internal.utils import LazyLoader, pkg
from openllm_core._typing_compat import overload, LiteralString
-
from .representation import ReprMixin
if t.TYPE_CHECKING:
BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]]
from openllm_core._typing_compat import LiteralRuntime
logger = logging.getLogger(__name__)
-OPTIONAL_DEPENDENCIES = {'opt', 'flan-t5', 'vllm', 'fine-tune', 'ggml', 'agents', 'openai', 'playground', 'gptq'}
+OPTIONAL_DEPENDENCIES = {'opt', 'flan-t5', 'vllm', 'fine-tune', 'ggml', 'agents', 'openai', 'playground', 'gptq', 'grpc'}
ENV_VARS_TRUE_VALUES = {'1', 'ON', 'YES', 'TRUE'}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({'AUTO'})
USE_TF = os.environ.get('USE_TF', 'AUTO').upper()
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 4e161c9d..838e9536 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -38,7 +38,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
- "bentoml[grpc,io]>=1.1.2",
+ "bentoml[io]>=1.1.2",
"transformers[torch,tokenizers,accelerate]>=4.29.0",
"openllm-client",
"safetensors",
@@ -102,10 +102,11 @@ falcon = ["einops", "xformers"]
fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
full = [
- "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,llama,mpt,openai,opt,playground,starcoder,vllm]",
+ "openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]",
]
ggml = ["ctransformers"]
gptq = ["auto-gptq[triton]"]
+grpc = ["openllm-client[grpc]"]
llama = ["fairscale", "sentencepiece"]
mpt = ["triton", "einops"]
openai = ["openai", "tiktoken"]
diff --git a/tools/assert-model-table-latest.py b/tools/assert-model-table-latest.py
index 26fa16d1..8065e8cb 100755
--- a/tools/assert-model-table-latest.py
+++ b/tools/assert-model-table-latest.py
@@ -2,24 +2,24 @@
from __future__ import annotations
import os, sys
from markdown_it import MarkdownIt
-
md = MarkdownIt()
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-with open(os.path.join(ROOT, "README.md"), "r") as f: readme = md.parse(f.read())
-sys.path.insert(0, os.path.join(ROOT,"openllm-python","src"))
+with open(os.path.join(ROOT, 'README.md'), 'r') as f:
+ readme = md.parse(f.read())
+sys.path.insert(0, os.path.join(ROOT, 'openllm-python', 'src'))
import openllm
# NOTE: Currently, we only have one table in README, which is the Model readme.
-table = [r for r in readme if r.type == "html_block" and r.content.startswith("
dict[int, str]: return {v: status for v, status in zip(range(1, 8), ["1 - Planning", "2 - Pre-Alpha", "3 - Alpha", "4 - Beta", "5 - Production/Stable", "6 - Mature", "7 - Inactive",],)}
+ def status() -> dict[int, str]:
+ return {v: status for v, status in zip(range(1, 8), ['1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive'])}
+
@staticmethod
- def apache() -> str: return Classifier.create_classifier("license", "OSI Approved", "Apache Software License")
+ def apache() -> str:
+ return Classifier.create_classifier('license', 'OSI Approved', 'Apache Software License')
+
@staticmethod
def create_classifier(identifier: str, *decls: t.Any) -> str:
cls_ = Classifier()
if identifier not in cls_.identifier:
- raise ValueError(f"{identifier} is not yet supported (supported alias: {Classifier.identifier})")
+ raise ValueError(f'{identifier} is not yet supported (supported alias: {Classifier.identifier})')
return cls_.joiner.join([cls_.identifier[identifier], *decls])
+
@staticmethod
def create_python_classifier(implementation: list[str] | None = None, supported_version: list[str] | None = None) -> list[str]:
- if supported_version is None: supported_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
- if implementation is None: implementation = ["CPython", "PyPy"]
- base = [Classifier.create_classifier("language", "Python"), Classifier.create_classifier("language", "Python", "3"),]
- base.append(Classifier.create_classifier("language", "Python", "3", "Only"))
- base.extend([Classifier.create_classifier("language", "Python", version) for version in supported_version])
- base.extend([Classifier.create_classifier("language", "Python", "Implementation", impl) for impl in implementation])
+ if supported_version is None: supported_version = ['3.8', '3.9', '3.10', '3.11', '3.12']
+ if implementation is None: implementation = ['CPython', 'PyPy']
+ base = [Classifier.create_classifier('language', 'Python'), Classifier.create_classifier('language', 'Python', '3'),]
+ base.append(Classifier.create_classifier('language', 'Python', '3', 'Only'))
+ base.extend([Classifier.create_classifier('language', 'Python', version) for version in supported_version])
+ base.extend([Classifier.create_classifier('language', 'Python', 'Implementation', impl) for impl in implementation])
return base
- @staticmethod
- def create_status_classifier(level: int) -> str: return Classifier.create_classifier("status", Classifier.status()[level])
+ @staticmethod
+ def create_status_classifier(level: int) -> str:
+ return Classifier.create_classifier('status', Classifier.status()[level])
@dataclasses.dataclass(frozen=True)
class Dependencies:
name: str
@@ -48,181 +63,199 @@ class Dependencies:
requires_gpu: bool = False
lower_constraint: t.Optional[str] = None
upper_constraint: t.Optional[str] = None
- platform: t.Optional[t.Tuple[t.Literal["Linux", "Windows", "Darwin"], t.Literal["eq", "ne"]]] = None
- def with_options(self, **kwargs: t.Any) -> Dependencies: return dataclasses.replace(self, **kwargs)
+ platform: t.Optional[t.Tuple[t.Literal['Linux', 'Windows', 'Darwin'], t.Literal['eq', 'ne']]] = None
+
+ def with_options(self, **kwargs: t.Any) -> Dependencies:
+ return dataclasses.replace(self, **kwargs)
+
@property
- def has_constraint(self) -> bool: return self.lower_constraint is not None or self.upper_constraint is not None
+ def has_constraint(self) -> bool:
+ return self.lower_constraint is not None or self.upper_constraint is not None
+
@property
- def pypi_extensions(self) -> str: return "" if self.extensions is None else f"[{','.join(self.extensions)}]"
+ def pypi_extensions(self) -> str:
+ return '' if self.extensions is None else f"[{','.join(self.extensions)}]"
+
@staticmethod
- def platform_restriction(platform: t.LiteralString, op: t.Literal["eq", "ne"] = "eq") -> str: return f'platform_system{"==" if op == "eq" else "!="}"{platform}"'
+ def platform_restriction(platform: t.LiteralString, op: t.Literal['eq', 'ne'] = 'eq') -> str:
+ return f'platform_system{"==" if op == "eq" else "!="}"{platform}"'
+
def to_str(self) -> str:
deps: list[str] = []
- if self.lower_constraint is not None and self.upper_constraint is not None: dep = f"{self.name}{self.pypi_extensions}>={self.lower_constraint},<{self.upper_constraint}"
- elif self.lower_constraint is not None: dep = f"{self.name}{self.pypi_extensions}>={self.lower_constraint}"
- elif self.upper_constraint is not None: dep = f"{self.name}{self.pypi_extensions}<{self.upper_constraint}"
- elif self.subdirectory is not None: dep = f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}"
- elif self.branch is not None: dep = f"{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}"
- else: dep = f"{self.name}{self.pypi_extensions}"
+ if self.lower_constraint is not None and self.upper_constraint is not None: dep = f'{self.name}{self.pypi_extensions}>={self.lower_constraint},<{self.upper_constraint}'
+ elif self.lower_constraint is not None: dep = f'{self.name}{self.pypi_extensions}>={self.lower_constraint}'
+ elif self.upper_constraint is not None: dep = f'{self.name}{self.pypi_extensions}<{self.upper_constraint}'
+ elif self.subdirectory is not None: dep = f'{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git#subdirectory={self.subdirectory}'
+ elif self.branch is not None: dep = f'{self.name}{self.pypi_extensions} @ git+https://github.com/{self.git_repo_url}.git@{self.branch}'
+ else: dep = f'{self.name}{self.pypi_extensions}'
deps.append(dep)
if self.platform: deps.append(self.platform_restriction(*self.platform))
- return ";".join(deps)
- @classmethod
- def from_tuple(cls, *decls: t.Any) -> Dependencies: return cls(*decls)
+ return ';'.join(deps)
-_BENTOML_EXT = ["grpc", "io"]
-_TRANSFORMERS_EXT = ["torch", "tokenizers", "accelerate"]
+ @classmethod
+ def from_tuple(cls, *decls: t.Any) -> Dependencies:
+ return cls(*decls)
+lower_bentoml_constraint = '1.1.2'
+_BENTOML_EXT = ['io']
+_TRANSFORMERS_EXT = ['torch', 'tokenizers', 'accelerate']
_BASE_DEPENDENCIES = [
- Dependencies(name="bentoml", extensions=_BENTOML_EXT, lower_constraint="1.1.2"),
- Dependencies(name="transformers", extensions=_TRANSFORMERS_EXT, lower_constraint="4.29.0"),
- Dependencies(name="openllm-client"),
- Dependencies(name="safetensors"),
- Dependencies(name="optimum"),
- Dependencies(name="ghapi"),
- Dependencies(name="tabulate", extensions=["widechars"], lower_constraint="0.9.0"),
- Dependencies(name="click", lower_constraint="8.1.3"),
- Dependencies(name="cuda-python", platform=("Darwin", "ne")),
- Dependencies(name="bitsandbytes", upper_constraint="0.42"), # 0.41 works with CUDA 11.8
+ Dependencies(name='bentoml', extensions=_BENTOML_EXT, lower_constraint=lower_bentoml_constraint),
+ Dependencies(name='transformers', extensions=_TRANSFORMERS_EXT, lower_constraint='4.29.0'),
+ Dependencies(name='openllm-client'),
+ Dependencies(name='safetensors'),
+ Dependencies(name='optimum'),
+ Dependencies(name='ghapi'),
+ Dependencies(name='tabulate', extensions=['widechars'], lower_constraint='0.9.0'),
+ Dependencies(name='click', lower_constraint='8.1.3'),
+ Dependencies(name='cuda-python', platform=('Darwin', 'ne')),
+ Dependencies(name='bitsandbytes', upper_constraint='0.42'), # 0.41 works with CUDA 11.8
]
-_ALL_RUNTIME_DEPS = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
-FINE_TUNE_DEPS = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
+_ALL_RUNTIME_DEPS = ['flax>=0.7', 'jax', 'jaxlib', 'tensorflow', 'keras']
+FINE_TUNE_DEPS = ['peft>=0.4.0', 'bitsandbytes', 'datasets', 'accelerate', 'trl']
FLAN_T5_DEPS = _ALL_RUNTIME_DEPS
OPT_DEPS = _ALL_RUNTIME_DEPS
-OPENAI_DEPS = ["openai", "tiktoken"]
-AGENTS_DEPS = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
-PLAYGROUND_DEPS = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
-GGML_DEPS = ["ctransformers"]
-GPTQ_DEPS = ["auto-gptq[triton]"]
-VLLM_DEPS = ["vllm", "ray"]
+GRPC_DEPS = ['openllm-client[grpc]']
+OPENAI_DEPS = ['openai', 'tiktoken']
+AGENTS_DEPS = ['transformers[agents]>=4.30', 'diffusers', 'soundfile']
+PLAYGROUND_DEPS = ['jupyter', 'notebook', 'ipython', 'jupytext', 'nbformat']
+GGML_DEPS = ['ctransformers']
+GPTQ_DEPS = ['auto-gptq[triton]']
+VLLM_DEPS = ['vllm', 'ray']
-_base_requirements: dict[str, t.Any] = {inflection.dasherize(name): config_cls.__openllm_requirements__ for name, config_cls in openllm.CONFIG_MAPPING.items() if config_cls.__openllm_requirements__}
+_base_requirements: dict[str, t.Any] = {
+ inflection.dasherize(name): config_cls.__openllm_requirements__ for name, config_cls in openllm.CONFIG_MAPPING.items() if config_cls.__openllm_requirements__
+}
# shallow copy from locals()
_locals = locals().copy()
# NOTE: update this table when adding new external dependencies
# sync with openllm.utils.OPTIONAL_DEPENDENCIES
-_base_requirements.update({v: _locals.get(f"{inflection.underscore(v).upper()}_DEPS") for v in openllm.utils.OPTIONAL_DEPENDENCIES})
+_base_requirements.update({v: _locals.get(f'{inflection.underscore(v).upper()}_DEPS') for v in openllm.utils.OPTIONAL_DEPENDENCIES})
_base_requirements = {k: v for k, v in sorted(_base_requirements.items())}
-fname = f"{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}"
-
+fname = f'{os.path.basename(os.path.dirname(__file__))}/{os.path.basename(__file__)}'
+def correct_style(it: t.Any) -> t.Any:
+ return it
def create_classifiers() -> Array:
- arr = tomlkit.array()
+ arr = correct_style(tomlkit.array())
arr.extend([
Classifier.create_status_classifier(5),
- Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA"),
- Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "12"),
- Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.8"),
- Classifier.create_classifier("environment", "GPU", "NVIDIA CUDA", "11.7"),
+ Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA'),
+ Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '12'),
+ Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.8'),
+ Classifier.create_classifier('environment', 'GPU', 'NVIDIA CUDA', '11.7'),
Classifier.apache(),
- Classifier.create_classifier("topic", "Scientific/Engineering", "Artificial Intelligence"),
- Classifier.create_classifier("topic", "Software Development", "Libraries"),
- Classifier.create_classifier("os", "OS Independent"),
- Classifier.create_classifier("audience", "Developers"),
- Classifier.create_classifier("audience", "Science/Research"),
- Classifier.create_classifier("audience", "System Administrators"),
- Classifier.create_classifier("typing", "Typed"), *Classifier.create_python_classifier(),
+ Classifier.create_classifier('topic', 'Scientific/Engineering', 'Artificial Intelligence'),
+ Classifier.create_classifier('topic', 'Software Development', 'Libraries'),
+ Classifier.create_classifier('os', 'OS Independent'),
+ Classifier.create_classifier('audience', 'Developers'),
+ Classifier.create_classifier('audience', 'Science/Research'),
+ Classifier.create_classifier('audience', 'System Administrators'),
+ Classifier.create_classifier('typing', 'Typed'),
+ *Classifier.create_python_classifier(),
])
return arr.multiline(True)
-
def create_optional_table() -> Table:
all_array = tomlkit.array()
all_array.append(f"openllm[{','.join(_base_requirements)}]")
table = tomlkit.table(is_super_table=True)
- _base_requirements.update({"full": all_array.multiline(True), "all": tomlkit.array('["openllm[full]"]')})
+ _base_requirements.update({'full': correct_style(all_array.multiline(True)), 'all': tomlkit.array('["openllm[full]"]')})
table.update({k: v for k, v in sorted(_base_requirements.items())})
table.add(tomlkit.nl())
return table
-
def create_url_table(_info: t.Any) -> Table:
table = tomlkit.table()
_urls = {
- "Blog": "https://modelserving.com", "Chat": "https://discord.gg/openllm", "Documentation": "https://github.com/bentoml/openllm#readme",
- "GitHub": _info.html_url,
- "History": f"{_info.html_url}/blob/main/CHANGELOG.md",
- "Homepage": _info.homepage,
- "Tracker": f"{_info.html_url}/issues",
- "Twitter": "https://twitter.com/bentomlai",
+ 'Blog': 'https://modelserving.com',
+ 'Chat': 'https://discord.gg/openllm',
+ 'Documentation': 'https://github.com/bentoml/openllm#readme',
+ 'GitHub': _info.html_url,
+ 'History': f'{_info.html_url}/blob/main/CHANGELOG.md',
+ 'Homepage': _info.homepage,
+ 'Tracker': f'{_info.html_url}/issues',
+ 'Twitter': 'https://twitter.com/bentomlai',
}
table.update({k: v for k, v in sorted(_urls.items())})
return table
-
def build_system() -> Table:
table = tomlkit.table()
- table.add("build-backend", "hatchling.build")
- requires_array = tomlkit.array()
- requires_array.extend(["hatchling==1.18.0", "hatch-vcs==0.3.0", "hatch-fancy-pypi-readme==23.1.0"])
- table.add("requires", requires_array.multiline(True))
+ table.add('build-backend', 'hatchling.build')
+ requires_array = correct_style(tomlkit.array())
+ requires_array.extend(['hatchling==1.18.0', 'hatch-vcs==0.3.0', 'hatch-fancy-pypi-readme==23.1.0'])
+ table.add('requires', requires_array.multiline(True))
return table
-
def authors() -> Array:
- arr = tomlkit.array()
- arr.append(dict(name="Aaron Pham", email="aarnphm@bentoml.com"))
- arr.append(dict(name="BentoML Team", email="contact@bentoml.com"))
+ arr = correct_style(tomlkit.array())
+ arr.append(dict(name='Aaron Pham', email='aarnphm@bentoml.com'))
+ arr.append(dict(name='BentoML Team', email='contact@bentoml.com'))
return arr.multiline(True)
-
def keywords() -> Array:
- arr = tomlkit.array()
+ arr = correct_style(tomlkit.array())
arr.extend([
- "MLOps",
- "AI",
- "BentoML",
- "Model Serving",
- "Model Deployment",
- "LLMOps",
- "Falcon",
- "Vicuna",
- "Llama 2",
- "Fine tuning",
- "Serverless",
- "Large Language Model",
- "Generative AI",
- "StableLM",
- "Alpaca",
- "PyTorch",
- "Transformers"])
- return arr
-
+ 'MLOps',
+ 'AI',
+ 'BentoML',
+ 'Model Serving',
+ 'Model Deployment',
+ 'LLMOps',
+ 'Falcon',
+ 'Vicuna',
+ 'Llama 2',
+ 'Fine tuning',
+ 'Serverless',
+ 'Large Language Model',
+ 'Generative AI',
+ 'StableLM',
+ 'Alpaca',
+ 'PyTorch',
+ 'Transformers'
+ ])
+ return arr.multiline(True)
def build_cli_extensions() -> Table:
table = tomlkit.table()
- ext: dict[str, str] = {"openllm": "openllm.cli.entrypoint:cli"}
- ext.update({f"openllm-{inflection.dasherize(ke)}": f"openllm.cli.extension.{ke}:cli" for ke in sorted([fname[:-3] for fname in os.listdir(os.path.abspath(os.path.join(ROOT, "openllm-python", "src", "openllm", "cli", "extension"))) if fname.endswith(".py") and not fname.startswith("__")])})
+ ext: dict[str, str] = {'openllm': 'openllm.cli.entrypoint:cli'}
+ ext.update({
+ f'openllm-{inflection.dasherize(ke)}': f'openllm.cli.extension.{ke}:cli' for ke in sorted([
+ fname[:-3] for fname in os.listdir(os.path.abspath(os.path.join(ROOT, 'openllm-python', 'src', 'openllm', 'cli', 'extension'))) if fname.endswith('.py') and
+ not fname.startswith('__')
+ ])
+ })
table.update(ext)
return table
-
def main() -> int:
api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
_info = api.repos.get()
- with open(os.path.join(ROOT, "openllm-python", "pyproject.toml"), "r") as f: pyproject = tomlkit.parse(f.read())
+ with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'r') as f:
+ pyproject = tomlkit.parse(f.read())
- dependencies_array = tomlkit.array()
+ dependencies_array = correct_style(tomlkit.array())
dependencies_array.extend([v.to_str() for v in _BASE_DEPENDENCIES])
# dynamic field
dyn_arr = tomlkit.array()
- dyn_arr.extend(["version", "readme"])
+ dyn_arr.extend(['version', 'readme'])
- pyproject["build-system"] = build_system()
- pyproject["project"]["authors"] = authors()
- pyproject["project"]["classifiers"] = create_classifiers()
- pyproject["project"]["dependencies"] = dependencies_array.multiline(True)
- pyproject["project"]["description"] = f"{_info.name}: {_info.description}"
- pyproject["project"]["dynamic"] = dyn_arr
- pyproject["project"]["keywords"] = keywords().multiline(True)
- pyproject["project"]["license"] = _info.license.spdx_id
- pyproject["project"]["name"] = f"{_info.name.lower()}"
- pyproject["project"]["requires-python"] = ">=3.8"
+ pyproject['build-system'] = build_system()
+ pyproject['project']['authors'] = authors()
+ pyproject['project']['classifiers'] = create_classifiers()
+ pyproject['project']['dependencies'] = dependencies_array.multiline(True)
+ pyproject['project']['description'] = f'{_info.name}: {_info.description}'
+ pyproject['project']['dynamic'] = dyn_arr
+ pyproject['project']['keywords'] = keywords()
+ pyproject['project']['license'] = _info.license.spdx_id
+ pyproject['project']['name'] = f'{_info.name.lower()}'
+ pyproject['project']['requires-python'] = '>=3.8'
- pyproject["project"]["urls"] = create_url_table(_info)
- pyproject["project"]["scripts"] = build_cli_extensions()
- pyproject["project"]["optional-dependencies"] = create_optional_table()
+ pyproject['project']['urls'] = create_url_table(_info)
+ pyproject['project']['scripts'] = build_cli_extensions()
+ pyproject['project']['optional-dependencies'] = create_optional_table()
- with open(os.path.join(ROOT, "openllm-python", "pyproject.toml"), "w") as f: f.write(tomlkit.dumps(pyproject))
+ with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'w') as f:
+ f.write(tomlkit.dumps(pyproject))
return 0
-
-if __name__ == "__main__": raise SystemExit(main())
+if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/generate-coverage.py b/tools/generate-coverage.py
index 65132443..d3e845c4 100755
--- a/tools/generate-coverage.py
+++ b/tools/generate-coverage.py
@@ -5,44 +5,41 @@ from pathlib import Path
import orjson
from lxml import etree
-
ROOT = Path(__file__).resolve().parent.parent
-PACKAGES = {"openllm-python/src/openllm/": "openllm"}
-
+PACKAGES = {'openllm-python/src/openllm/': 'openllm'}
def main() -> int:
- coverage_report = ROOT/"coverage.xml"
+ coverage_report = ROOT / 'coverage.xml'
root = etree.fromstring(coverage_report.read_text())
- raw_package_data: defaultdict[str, dict[str, int]] = defaultdict(lambda: {"hits": 0, "misses": 0})
- for package in root.find("packages"):
- for module in package.find("classes"):
- filename = module.attrib["filename"]
+ raw_package_data: defaultdict[str, dict[str, int]] = defaultdict(lambda: {'hits': 0, 'misses': 0})
+ for package in root.find('packages'):
+ for module in package.find('classes'):
+ filename = module.attrib['filename']
for relative_path, package_name in PACKAGES.items():
if filename.startswith(relative_path):
data = raw_package_data[package_name]
break
else:
- message = f"unknown package: {module}"
+ message = f'unknown package: {module}'
raise ValueError(message)
- for line in module.find("lines"):
- if line.attrib["hits"] == "1": data["hits"] += 1
- else: data["misses"] += 1
+ for line in module.find('lines'):
+ if line.attrib['hits'] == '1': data['hits'] += 1
+ else: data['misses'] += 1
total_statements_covered = 0
total_statements = 0
coverage_data = {}
for package_name, data in sorted(raw_package_data.items()):
- statements_covered = data["hits"]
- statements = statements_covered + data["misses"]
+ statements_covered = data['hits']
+ statements = statements_covered + data['misses']
total_statements_covered += statements_covered
total_statements += statements
- coverage_data[package_name] = {"statements_covered": statements_covered, "statements": statements}
- coverage_data["total"] = {"statements_covered": total_statements_covered, "statements": total_statements}
+ coverage_data[package_name] = {'statements_covered': statements_covered, 'statements': statements}
+ coverage_data['total'] = {'statements_covered': total_statements_covered, 'statements': total_statements}
- coverage_summary = ROOT/"coverage-summary.json"
- coverage_summary.write_text(orjson.dumps(coverage_data, option=orjson.OPT_INDENT_2).decode(), encoding="utf-8")
+ coverage_summary = ROOT / 'coverage-summary.json'
+ coverage_summary.write_text(orjson.dumps(coverage_data, option=orjson.OPT_INDENT_2).decode(), encoding='utf-8')
return 0
-
-if __name__ == "__main__": raise SystemExit(main())
+if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-brew-tap.py b/tools/update-brew-tap.py
index 43dd7bed..fc94a3a1 100755
--- a/tools/update-brew-tap.py
+++ b/tools/update-brew-tap.py
@@ -12,31 +12,46 @@ if t.TYPE_CHECKING: from plumbum.commands.base import Pipeline
# get git root from this file
ROOT = Path(__file__).parent.parent
-_OWNER = "bentoml"
-_REPO = "openllm"
+_OWNER = 'bentoml'
+_REPO = 'openllm'
-_gz_strategies: dict[t.Literal["macos_arm", "macos_intel", "linux_intel"], str] = {"macos_arm": "aarch64-apple-darwin", "macos_intel": "x86_64-apple-darwin", "linux_intel": "x86_64-unknown-linux-musl"}
-
-def determine_release_url(svn_url: str, tag: str, target: t.Literal["macos_arm", "macos_intel", "linux_intel", "archive"]) -> str:
- if target == "archive": return f"{svn_url}/archive/{tag}.tar.gz"
+_gz_strategies: dict[t.Literal['macos_arm', 'macos_intel', 'linux_intel'], str] = {
+ 'macos_arm': 'aarch64-apple-darwin', 'macos_intel': 'x86_64-apple-darwin', 'linux_intel': 'x86_64-unknown-linux-musl'
+}
+def determine_release_url(svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel', 'archive']) -> str:
+ if target == 'archive': return f'{svn_url}/archive/{tag}.tar.gz'
return f"{svn_url}/releases/download/{tag}/openllm-{tag.replace('v', '')}-{_gz_strategies[target]}.tar.gz"
-
# curl -sSL /archive/refs/tags/.tar.gz | shasum -a256 | cut -d'' -f1
-def get_release_hash_command(svn_url: str, tag: str) -> Pipeline: return curl["-sSL", svn_url] | shasum["-a256"] | cut["-d", " ", "-f1"]
-
+def get_release_hash_command(svn_url: str, tag: str) -> Pipeline:
+ return curl['-sSL', svn_url] | shasum['-a256'] | cut['-d', ' ', '-f1']
def main() -> int:
api = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
_info = api.repos.get()
release_tag = api.repos.get_latest_release().name
shadict: dict[str, t.Any] = {k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip() for k in _gz_strategies}
- shadict["archive"] = get_release_hash_command(determine_release_url(_info.svn_url, release_tag, "archive"), release_tag)().strip()
+ shadict['archive'] = get_release_hash_command(determine_release_url(_info.svn_url, release_tag, 'archive'), release_tag)().strip()
- ENVIRONMENT = Environment(extensions=["jinja2.ext.do", "jinja2.ext.loopcontrols", "jinja2.ext.debug"], trim_blocks=True, lstrip_blocks=True, loader=FileSystemLoader((ROOT / "Formula").__fspath__(), followlinks=True))
- template_file = "openllm.rb.j2"
- with (ROOT/"Formula"/"openllm.rb").open("w") as f:
- f.write(ENVIRONMENT.get_template(template_file, globals={"determine_release_url": determine_release_url}).render(shadict=shadict, __tag__=release_tag, __cmd__=fs.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)), __template_file__=fs.path.join("Formula", template_file), __gz_extension__=_gz_strategies, **_info))
- f.write("\n")
+ ENVIRONMENT = Environment(
+ extensions=['jinja2.ext.do', 'jinja2.ext.loopcontrols', 'jinja2.ext.debug'],
+ trim_blocks=True,
+ lstrip_blocks=True,
+ loader=FileSystemLoader((ROOT / 'Formula').__fspath__(), followlinks=True)
+ )
+ template_file = 'openllm.rb.j2'
+ with (ROOT / 'Formula' / 'openllm.rb').open('w') as f:
+ f.write(
+ ENVIRONMENT.get_template(template_file, globals={
+ 'determine_release_url': determine_release_url
+ }).render(
+ shadict=shadict,
+ __tag__=release_tag,
+ __cmd__=fs.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)),
+ __template_file__=fs.path.join('Formula', template_file),
+ __gz_extension__=_gz_strategies,
+ **_info
+ )
+ )
+ f.write('\n')
return 0
-
-if __name__ == "__main__": raise SystemExit(main())
+if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py
index fbfdc2ef..bc849797 100755
--- a/tools/update-config-stubs.py
+++ b/tools/update-config-stubs.py
@@ -4,40 +4,38 @@ import os, sys
from pathlib import Path
# currently we are assuming the indentatio level is 2 for comments
-START_COMMENT = f"# {os.path.basename(__file__)}: start\n"
-END_COMMENT = f"# {os.path.basename(__file__)}: stop\n"
-START_SPECIAL_COMMENT = f"# {os.path.basename(__file__)}: special start\n"
-END_SPECIAL_COMMENT = f"# {os.path.basename(__file__)}: special stop\n"
-START_ATTRS_COMMENT = f"# {os.path.basename(__file__)}: attrs start\n"
-END_ATTRS_COMMENT = f"# {os.path.basename(__file__)}: attrs stop\n"
+START_COMMENT = f'# {os.path.basename(__file__)}: start\n'
+END_COMMENT = f'# {os.path.basename(__file__)}: stop\n'
+START_SPECIAL_COMMENT = f'# {os.path.basename(__file__)}: special start\n'
+END_SPECIAL_COMMENT = f'# {os.path.basename(__file__)}: special stop\n'
+START_ATTRS_COMMENT = f'# {os.path.basename(__file__)}: attrs start\n'
+END_ATTRS_COMMENT = f'# {os.path.basename(__file__)}: attrs stop\n'
ROOT = Path(__file__).parent.parent
-_TARGET_FILE = ROOT/"openllm-core"/"src"/"openllm_core"/"_configuration.py"
+_TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.py'
-sys.path.insert(0, (ROOT/"openllm-core"/"src").__fspath__())
+sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__())
from openllm_core._configuration import GenerationConfig, ModelSettings, PeftType, SamplingParams
from openllm_core.utils import codegen
-
def process_annotations(annotations: str) -> str:
- if "NotRequired" in annotations: return annotations[len("NotRequired["):-1]
- elif "Required" in annotations: return annotations[len("Required["):-1]
+ if 'NotRequired' in annotations: return annotations[len('NotRequired['):-1]
+ elif 'Required' in annotations: return annotations[len('Required['):-1]
else: return annotations
-
_value_docstring = {
- "default_id": """Return the default model to use when using 'openllm start '.
+ 'default_id': '''Return the default model to use when using 'openllm start '.
This could be one of the keys in 'self.model_ids' or custom users model.
This field is required when defining under '__config__'.
- """,
- "model_ids": """A list of supported pretrained models tag for this given runnable.
+ ''',
+ 'model_ids': '''A list of supported pretrained models tag for this given runnable.
For example:
For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
"google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]
This field is required when defining under '__config__'.
- """,
- "architecture": """The model architecture that is supported by this LLM.
+ ''',
+ 'architecture': '''The model architecture that is supported by this LLM.
Note that any model weights within this architecture generation can always be run and supported by this LLM.
@@ -46,30 +44,30 @@ _value_docstring = {
```bash
openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b
- ```""",
- "default_implementation": """The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`.
+ ```''',
+ 'default_implementation': '''The default runtime to run this LLM. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`.
It is a dictionary of key as the accelerator spec in k4s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM Runtime ('flax', 'tf', 'pt', 'vllm')
- """,
- "url": """The resolved url for this LLMConfig.""",
- "requires_gpu": """Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.""",
- "trust_remote_code": """Whether to always trust remote code""",
- "service_name": """Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'""",
- "requirements": """The default PyPI requirements needed to run this given LLM. By default, we will depend on
- bentoml, torch, transformers.""",
- "bettertransformer": """Whether to use BetterTransformer for this given LLM. This depends per model architecture. By default, we will use BetterTransformer for T5 and StableLM models, and set to False for every other models.""",
- "model_type": """The model type for this given LLM. By default, it should be causal language modeling.
+ ''',
+ 'url': '''The resolved url for this LLMConfig.''',
+ 'requires_gpu': '''Determines if this model is only available on GPU. By default it supports GPU and falls back to CPU.''',
+ 'trust_remote_code': '''Whether to always trust remote code''',
+ 'service_name': """Generated service name for this LLMConfig. By default, it is 'generated_{model_name}_service.py'""",
+ 'requirements': '''The default PyPI requirements needed to run this given LLM. By default, we will depend on
+ bentoml, torch, transformers.''',
+ 'bettertransformer': '''Whether to use BetterTransformer for this given LLM. This depends per model architecture. By default, we will use BetterTransformer for T5 and StableLM models, and set to False for all other models.''',
+ 'model_type': '''The model type for this given LLM. By default, it should be causal language modeling.
Currently supported 'causal_lm' or 'seq2seq_lm'
- """,
- "runtime": """The runtime to use for this model. Possible values are `transformers` or `ggml`. See Llama for more information.""",
- "name_type": """The default name typed for this model. "dasherize" will convert the name to lowercase and
+ ''',
+ 'runtime': '''The runtime to use for this model. Possible values are `transformers` or `ggml`. See Llama for more information.''',
+ 'name_type': '''The default name type for this model. "dasherize" will convert the name to lowercase and
replace spaces with dashes. "lowercase" will convert the name to lowercase. If this is not set, then both
- `model_name` and `start_name` must be specified.""",
- "model_name": """The normalized version of __openllm_start_name__, determined by __openllm_name_type__""",
- "start_name": """Default name to be used with `openllm start`""",
- "env": """A EnvVarMixin instance for this LLMConfig.""",
- "timeout": """The default timeout to be set for this given LLM.""",
- "workers_per_resource": """The number of workers per resource. This is used to determine the number of workers to use for this model.
+ `model_name` and `start_name` must be specified.''',
+ 'model_name': '''The normalized version of __openllm_start_name__, determined by __openllm_name_type__''',
+ 'start_name': '''Default name to be used with `openllm start`''',
+ 'env': '''An EnvVarMixin instance for this LLMConfig.''',
+ 'timeout': '''The default timeout to be set for this given LLM.''',
+ 'workers_per_resource': '''The number of workers per resource. This is used to determine the number of workers to use for this model.
For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.
@@ -77,50 +75,64 @@ _value_docstring = {
https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details.
By default, it is set to 1.
- """,
- "fine_tune_strategies": """The fine-tune strategies for this given LLM.""",
- "tokenizer_class": """Optional tokenizer class for this given LLM. See Llama for example.""",
+ ''',
+ 'fine_tune_strategies': '''The fine-tune strategies for this given LLM.''',
+ 'tokenizer_class': '''Optional tokenizer class for this given LLM. See Llama for example.''',
}
-_transformed = {"fine_tune_strategies": "t.Dict[AdapterType, FineTuneConfig]"}
-
+_transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'}
def main() -> int:
- with _TARGET_FILE.open("r") as f: processed = f.readlines()
+ with _TARGET_FILE.open('r') as f:
+ processed = f.readlines()
- start_idx, end_idx = processed.index(" "*2 + START_COMMENT), processed.index(" "*2 + END_COMMENT)
- start_stub_idx, end_stub_idx = processed.index(" "*4 + START_SPECIAL_COMMENT), processed.index(" "*4 + END_SPECIAL_COMMENT)
- start_attrs_idx, end_attrs_idx = processed.index(" "*4 + START_ATTRS_COMMENT), processed.index(" "*4 + END_ATTRS_COMMENT)
+ start_idx, end_idx = processed.index(' '*2 + START_COMMENT), processed.index(' '*2 + END_COMMENT)
+ start_stub_idx, end_stub_idx = processed.index(' '*4 + START_SPECIAL_COMMENT), processed.index(' '*4 + END_SPECIAL_COMMENT)
+ start_attrs_idx, end_attrs_idx = processed.index(' '*4 + START_ATTRS_COMMENT), processed.index(' '*4 + END_ATTRS_COMMENT)
# NOTE: inline stubs __config__ attrs representation
special_attrs_lines: list[str] = []
- for keys, ForwardRef in codegen.get_annotations(ModelSettings).items(): special_attrs_lines.append(f"{' ' * 4}{keys}: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}\n")
+ for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
+ special_attrs_lines.append(f"{' ' * 4}{keys}: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}\n")
# NOTE: inline stubs for _ConfigAttr type stubs
config_attr_lines: list[str] = []
for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
- config_attr_lines.extend([" "*4 + line for line in [f"__openllm_{keys}__: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n", f'"""{_value_docstring[keys]}"""\n',]])
+ config_attr_lines.extend([
+ ' '*4 + line for line in [f'__openllm_{keys}__: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n', f'"""{_value_docstring[keys]}"""\n',]
+ ])
# NOTE: inline runtime __getitem__ overload process
lines: list[str] = []
- lines.append(" "*2 + "# NOTE: ModelSettings arguments\n")
- for keys, ForwardRef in codegen.get_annotations(ModelSettings).items(): lines.extend([" "*2 + line for line in ["@overload\n", f'def __getitem__(self, item: t.Literal["{keys}"]) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n',]])
+ lines.append(' '*2 + '# NOTE: ModelSettings arguments\n')
+ for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
+ lines.extend([
+ ' '*2 + line for line in ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys}"]) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n',]
+ ])
# special case variables: generation_class, extras, sampling_class
- lines.append(" "*2 + "# NOTE: generation_class, sampling_class and extras arguments\n")
+ lines.append(' '*2 + '# NOTE: generation_class, sampling_class and extras arguments\n')
lines.extend([
- " "*2 + line for line in [
- "@overload\n", 'def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm_core.GenerationConfig]: ...\n',
- "@overload\n", 'def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm_core.SamplingParams]: ...\n',
- "@overload\n", 'def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...\n',
- ]])
- lines.append(" "*2 + "# NOTE: GenerationConfig arguments\n")
+ ' '*2 + line for line in [
+ '@overload\n',
+ 'def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm_core.GenerationConfig]: ...\n',
+ '@overload\n',
+ 'def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm_core.SamplingParams]: ...\n',
+ '@overload\n',
+ 'def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...\n',
+ ]
+ ])
+ lines.append(' '*2 + '# NOTE: GenerationConfig arguments\n')
generation_config_anns = codegen.get_annotations(GenerationConfig)
- for keys, type_pep563 in generation_config_anns.items(): lines.extend([" "*2 + line for line in ["@overload\n", f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n']])
- lines.append(" "*2 + "# NOTE: SamplingParams arguments\n")
+ for keys, type_pep563 in generation_config_anns.items():
+ lines.extend([' '*2 + line for line in ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n']])
+ lines.append(' '*2 + '# NOTE: SamplingParams arguments\n')
for keys, type_pep563 in codegen.get_annotations(SamplingParams).items():
- if keys not in generation_config_anns: lines.extend([" "*2 + line for line in ["@overload\n", f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n',]])
- lines.append(" "*2 + "# NOTE: PeftType arguments\n")
- for keys in PeftType._member_names_: lines.extend([" "*2 + line for line in ["@overload\n", f'def __getitem__(self, item: t.Literal["{keys.lower()}"]) -> dict[str, t.Any]: ...\n',]])
+ if keys not in generation_config_anns: lines.extend([' '*2 + line for line in ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys}"]) -> {type_pep563}: ...\n',]])
+ lines.append(' '*2 + '# NOTE: PeftType arguments\n')
+ for keys in PeftType._member_names_:
+ lines.extend([' '*2 + line for line in ['@overload\n', f'def __getitem__(self, item: t.Literal["{keys.lower()}"]) -> dict[str, t.Any]: ...\n',]])
- processed = processed[:start_attrs_idx] + [" "*4 + START_ATTRS_COMMENT, *special_attrs_lines, " "*4 + END_ATTRS_COMMENT] + processed[end_attrs_idx + 1:start_stub_idx] + [" "*4 + START_SPECIAL_COMMENT, *config_attr_lines, " "*4 + END_SPECIAL_COMMENT] + processed[end_stub_idx + 1:start_idx] + [" "*2 + START_COMMENT, *lines, " "*2 + END_COMMENT] + processed[end_idx + 1:]
- with _TARGET_FILE.open("w") as f: f.writelines(processed)
+ processed = processed[:start_attrs_idx] + [' '*4 + START_ATTRS_COMMENT, *special_attrs_lines, ' '*4 + END_ATTRS_COMMENT] + processed[end_attrs_idx + 1:start_stub_idx] + [
+ ' '*4 + START_SPECIAL_COMMENT, *config_attr_lines, ' '*4 + END_SPECIAL_COMMENT
+ ] + processed[end_stub_idx + 1:start_idx] + [' '*2 + START_COMMENT, *lines, ' '*2 + END_COMMENT] + processed[end_idx + 1:]
+ with _TARGET_FILE.open('w') as f:
+ f.writelines(processed)
return 0
-
-if __name__ == "__main__": raise SystemExit(main())
+if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-dummy.py b/tools/update-dummy.py
index 3714f561..b610812e 100755
--- a/tools/update-dummy.py
+++ b/tools/update-dummy.py
@@ -4,48 +4,54 @@ import os, typing as t, sys
from pathlib import Path
_ROOT = Path(__file__).parent.parent
-sys.path.insert(0, (_ROOT/"openllm-core"/"src").__fspath__())
-sys.path.insert(1, (_ROOT/"openllm-python"/"src").__fspath__())
+sys.path.insert(0, (_ROOT / 'openllm-core' / 'src').__fspath__())
+sys.path.insert(1, (_ROOT / 'openllm-python' / 'src').__fspath__())
from openllm_core._configuration import LiteralRuntime
from openllm.models import auto
from openllm import CONFIG_MAPPING
if t.TYPE_CHECKING: from collections import OrderedDict
-config_requirements = {k:[_.replace("-", "_") for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k,v in CONFIG_MAPPING.items()}
-_dependencies: dict[LiteralRuntime,str] = {k:v for k,v in zip(LiteralRuntime.__args__, ("torch", "tensorflow", "flax", "vllm"))}
-_auto: dict[str,str] = {k:v for k,v in zip(LiteralRuntime.__args__, ("AutoLLM", "AutoTFLLM", "AutoFlaxLLM", "AutoVLLM"))}
-
-def get_target_dummy_file(framework: LiteralRuntime) -> Path: return _ROOT/"openllm-python"/"src"/"openllm"/"utils"/f"dummy_{framework}_objects.py"
-def mapping_names(framework: LiteralRuntime): return "MODEL_MAPPING_NAMES" if framework == "pt" else f"MODEL_{framework.upper()}_MAPPING_NAMES"
-def get_mapping(framework: LiteralRuntime) -> OrderedDict[t.Any, t.Any]: return getattr(auto, mapping_names(framework))
-
+config_requirements = {k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items()}
+_dependencies: dict[LiteralRuntime, str] = {k: v for k, v in zip(LiteralRuntime.__args__, ('torch', 'tensorflow', 'flax', 'vllm'))}
+_auto: dict[str, str] = {k: v for k, v in zip(LiteralRuntime.__args__, ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))}
+def get_target_dummy_file(framework: LiteralRuntime) -> Path:
+ return _ROOT / 'openllm-python' / 'src' / 'openllm' / 'utils' / f'dummy_{framework}_objects.py'
+def mapping_names(framework: LiteralRuntime):
+ return 'MODEL_MAPPING_NAMES' if framework == 'pt' else f'MODEL_{framework.upper()}_MAPPING_NAMES'
+def get_mapping(framework: LiteralRuntime) -> OrderedDict[t.Any, t.Any]:
+ return getattr(auto, mapping_names(framework))
def make_class_stub(model_name: str, framework: LiteralRuntime, indentation: int = 2, auto: bool = False) -> list[str]:
- _dep_list: list[str] = [f'"{v}"' for v in [_dependencies[framework], *(t.cast(t.List[str], config_requirements[model_name]) if model_name != "__default__" and config_requirements[model_name] else [])]]
+ _dep_list: list[str] = [
+ f'"{v}"' for v in [_dependencies[framework], *(t.cast(t.List[str], config_requirements[model_name]) if model_name != '__default__' and config_requirements[model_name] else [])]
+ ]
if auto: cl_ = _auto[framework]
else: cl_ = get_mapping(framework)[model_name]
- lines = [f"class {cl_}(metaclass=_DummyMetaclass):"," "*indentation + f"_backends=[{','.join(_dep_list)}]"," "*indentation + f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])"]
+ lines = [
+ f'class {cl_}(metaclass=_DummyMetaclass):',
+ ' '*indentation + f"_backends=[{','.join(_dep_list)}]",
+ ' '*indentation + f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])"
+ ]
return lines
-
def write_stub(framework: LiteralRuntime, _path: str) -> list[str]:
- base = [f"# This file is generated by {_path}. DO NOT EDIT MANUALLY!",
- f"# To update this, run ./{_path}",
- "from __future__ import annotations",
- "import typing as _t",
- "from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends",
- ]
+ base = [
+ f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!',
+ f'# To update this, run ./{_path}',
+ 'from __future__ import annotations',
+ 'import typing as _t',
+ 'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends',
+ ]
base.extend([v for it in [make_class_stub(k, framework) for k in get_mapping(framework)] for v in it])
# autoclass
- base.extend(make_class_stub("__default__", framework, auto=True))
+ base.extend(make_class_stub('__default__', framework, auto=True))
# mapping and export
_imports = [f'"{v}"' for v in get_mapping(framework).values()]
- base += [f"{mapping_names(framework)}:_t.Any=None", f"__all__:list[str]=[\"{mapping_names(framework)}\",\"{_auto[framework]}\",{','.join(_imports)}]\n"]
+ base += [f'{mapping_names(framework)}:_t.Any=None', f"__all__:list[str]=[\"{mapping_names(framework)}\",\"{_auto[framework]}\",{','.join(_imports)}]\n"]
return base
-
def main() -> int:
_path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__))
for framework in _dependencies:
- with get_target_dummy_file(framework).open("w") as f: f.write("\n".join(write_stub(framework, _path)))
+ with get_target_dummy_file(framework).open('w') as f:
+ f.write('\n'.join(write_stub(framework, _path)))
return 0
-
-if __name__ == "__main__": raise SystemExit(main())
+if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-models-import.py b/tools/update-models-import.py
index 1f201090..098e3147 100755
--- a/tools/update-models-import.py
+++ b/tools/update-models-import.py
@@ -2,25 +2,31 @@
from __future__ import annotations
import os
from pathlib import Path
-
-_TARGET_FILE = Path(__file__).parent.parent/"openllm-python"/"src"/"openllm"/"models"/"__init__.py"
-
+_TARGET_FILE = Path(__file__).parent.parent / 'openllm-python' / 'src' / 'openllm' / 'models' / '__init__.py'
def create_module_import() -> str:
r = [f'"{p.name}"' for p in _TARGET_FILE.parent.glob('*/') if p.name not in ['__pycache__', '__init__.py', '.DS_Store']]
return f"_MODELS:set[str]={{{', '.join(sorted(r))}}}"
-def create_stubs_import() -> list[str]: return ["if t.TYPE_CHECKING:from . import "+",".join([f"{p.name} as {p.name}" for p in sorted(_TARGET_FILE.parent.glob("*/")) if p.name not in {"__pycache__", "__init__.py", ".DS_Store"}]),
- '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})', "__all__=__lazy.__all__", "__dir__=__lazy.__dir__", "__getattr__=__lazy.__getattr__\n"]
-
+def create_stubs_import() -> list[str]:
+ return [
+ 'if t.TYPE_CHECKING:from . import ' + ','.join([f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}]),
+ '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})',
+ '__all__=__lazy.__all__',
+ '__dir__=__lazy.__dir__',
+ '__getattr__=__lazy.__getattr__\n'
+ ]
def main() -> int:
_path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__))
- with _TARGET_FILE.open("w") as f: f.writelines("\n".join([
- f"# This file is generated by {_path}. DO NOT EDIT MANUALLY!",
- f"# To update this, run ./{_path}",
- "from __future__ import annotations",
- "import typing as t",
- "from openllm_core.utils import LazyModule",
- create_module_import(),
- *create_stubs_import(),
- ]))
+ with _TARGET_FILE.open('w') as f:
+ f.writelines(
+ '\n'.join([
+ f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!',
+ f'# To update this, run ./{_path}',
+ 'from __future__ import annotations',
+ 'import typing as t',
+ 'from openllm_core.utils import LazyModule',
+ create_module_import(),
+ *create_stubs_import(),
+ ])
+ )
return 0
-if __name__ == "__main__": raise SystemExit(main())
+if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/update-readme.py b/tools/update-readme.py
index a5f2f5ba..398002f7 100755
--- a/tools/update-readme.py
+++ b/tools/update-readme.py
@@ -2,54 +2,58 @@
from __future__ import annotations
import os, inflection, tomlkit, sys
import typing as t
-
-START_COMMENT = f"\n"
-END_COMMENT = f"\n"
+START_COMMENT = f'\n'
+END_COMMENT = f'\n'
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.insert(0, os.path.join(ROOT,"openllm-python","src"))
+sys.path.insert(0, os.path.join(ROOT, 'openllm-python', 'src'))
import openllm
-
def main() -> int:
- with open(os.path.join(ROOT, "openllm-python", "pyproject.toml"), "r") as f: deps = tomlkit.parse(f.read()).value["project"]["optional-dependencies"]
- with open(os.path.join(ROOT, "README.md"), "r") as f: readme = f.readlines()
+ with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'r') as f:
+ deps = tomlkit.parse(f.read()).value['project']['optional-dependencies']
+ with open(os.path.join(ROOT, 'README.md'), 'r') as f:
+ readme = f.readlines()
start_index, stop_index = readme.index(START_COMMENT), readme.index(END_COMMENT)
- formatted: dict[t.Literal["Model", "Architecture", "URL", "Installation", "Model Ids"], list[str | list[str]]] = {"Model": [], "Architecture": [], "URL": [], "Model Ids": [], "Installation": [],}
+ formatted: dict[t.Literal['Model', 'Architecture', 'URL', 'Installation', 'Model Ids'], list[str | list[str]]] = {
+ 'Model': [], 'Architecture': [], 'URL': [], 'Model Ids': [], 'Installation': [],
+ }
max_install_len_div = 0
for name, config_cls in openllm.CONFIG_MAPPING.items():
dashed = inflection.dasherize(name)
- formatted["Model"].append(dashed)
- formatted["Architecture"].append(config_cls.__openllm_architecture__)
- formatted["URL"].append(config_cls.__openllm_url__)
- formatted["Model Ids"].append(config_cls.__openllm_model_ids__)
+ formatted['Model'].append(dashed)
+ formatted['Architecture'].append(config_cls.__openllm_architecture__)
+ formatted['URL'].append(config_cls.__openllm_url__)
+ formatted['Model Ids'].append(config_cls.__openllm_model_ids__)
if dashed in deps: instruction = f'```bash\npip install "openllm[{dashed}]"\n```'
- else: instruction = "```bash\npip install openllm\n```"
+ else: instruction = '```bash\npip install openllm\n```'
if len(instruction) > max_install_len_div: max_install_len_div = len(instruction)
- formatted["Installation"].append(instruction)
- meta: list[str] = ["\n", "\n"]
+ formatted['Installation'].append(instruction)
+ meta: list[str] = ['\n', "\n"]
# NOTE: headers
- meta += ["\n"]
- meta.extend([f"| {header} | \n" for header in formatted.keys() if header not in ("URL",)])
- meta += [" \n"]
+ meta += ['\n']
+ meta.extend([f'| {header} | \n' for header in formatted.keys() if header not in ('URL',)])
+ meta += [' \n']
# NOTE: rows
for name, architecture, url, model_ids, installation in t.cast(t.Iterable[t.Tuple[str, str, str, t.List[str], str]], zip(*formatted.values())):
- meta += "\n"
+ meta += ' \n'
# configure architecture URL
cfg_cls = openllm.CONFIG_MAPPING[name]
- if cfg_cls.__openllm_trust_remote_code__: arch = f"{architecture} | \n"
- else: arch = f"{architecture} | \n"
- meta.extend([f"\n{name} | \n", arch])
+ if cfg_cls.__openllm_trust_remote_code__: arch = f'{architecture} | \n'
+ else:
+ arch = f"{architecture} | \n"
+ meta.extend([f'\n{name} | \n', arch])
format_with_links: list[str] = []
- for lid in model_ids: format_with_links.append(f"{lid}")
- meta.append("\n\n" + "\n".join(format_with_links) + " \n\n | \n")
- meta.append(f"\n\n{installation}\n\n | \n")
- meta += " \n"
- meta.extend([" \n", "\n"])
+ for lid in model_ids:
+ format_with_links.append(f'{lid}')
+ meta.append('\n\n' + '\n'.join(format_with_links) + ' \n\n | \n')
+ meta.append(f'\n\n{installation}\n\n | \n')
+ meta += '\n'
+ meta.extend([' \n', '\n'])
readme = readme[:start_index] + [START_COMMENT] + meta + [END_COMMENT] + readme[stop_index + 1:]
- with open(os.path.join(ROOT, "README.md"), "w") as f: f.writelines(readme)
+ with open(os.path.join(ROOT, 'README.md'), 'w') as f:
+ f.writelines(readme)
return 0
-
-if __name__ == "__main__": raise SystemExit(main())
+if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/write-coverage-report.py b/tools/write-coverage-report.py
index c3af337d..53edac46 100755
--- a/tools/write-coverage-report.py
+++ b/tools/write-coverage-report.py
@@ -3,38 +3,36 @@ from __future__ import annotations
from decimal import ROUND_DOWN, Decimal
from pathlib import Path
import orjson
-
-PRECISION = Decimal(".01")
+PRECISION = Decimal('.01')
ROOT = Path(__file__).resolve().parent.parent
-
def main() -> int:
- coverage_summary = ROOT/"coverage-summary.json"
+ coverage_summary = ROOT / 'coverage-summary.json'
- coverage_data = orjson.loads(coverage_summary.read_text(encoding="utf-8"))
- total_data = coverage_data.pop("total")
+ coverage_data = orjson.loads(coverage_summary.read_text(encoding='utf-8'))
+ total_data = coverage_data.pop('total')
- lines = ["\n", "Package | Statements\n", "------- | ----------\n",]
+ lines = ['\n', 'Package | Statements\n', '------- | ----------\n',]
for package, data in sorted(coverage_data.items()):
- statements_covered = data["statements_covered"]
- statements = data["statements"]
+ statements_covered = data['statements_covered']
+ statements = data['statements']
rate = Decimal(statements_covered) / Decimal(statements) * 100
rate = rate.quantize(PRECISION, rounding=ROUND_DOWN)
- lines.append(f"{package} | {100 if rate == 100 else rate}% ({statements_covered} / {statements})\n")
+ lines.append(f'{package} | {100 if rate == 100 else rate}% ({statements_covered} / {statements})\n')
- total_statements_covered = total_data["statements_covered"]
- total_statements = total_data["statements"]
+ total_statements_covered = total_data['statements_covered']
+ total_statements = total_data['statements']
total_rate = Decimal(total_statements_covered) / Decimal(total_statements) * 100
total_rate = total_rate.quantize(PRECISION, rounding=ROUND_DOWN)
- color = "ok" if float(total_rate) >= 95 else "critical"
- lines.insert(0, f"\n")
+ color = 'ok' if float(total_rate) >= 95 else 'critical'
+ lines.insert(0, f'\n')
- lines.append(f"**Summary** | {100 if total_rate == 100 else total_rate}% ({total_statements_covered} / {total_statements})\n")
+ lines.append(f'**Summary** | {100 if total_rate == 100 else total_rate}% ({total_statements_covered} / {total_statements})\n')
- coverage_report = ROOT/"coverage-report.md"
- with coverage_report.open("w", encoding="utf-8") as f: f.write("".join(lines))
+ coverage_report = ROOT / 'coverage-report.md'
+ with coverage_report.open('w', encoding='utf-8') as f:
+ f.write(''.join(lines))
return 0
-
-if __name__ == "__main__": raise SystemExit(main())
+if __name__ == '__main__': raise SystemExit(main())
diff --git a/tools/yapf b/tools/yapf
deleted file mode 100755
index 2389ae42..00000000
--- a/tools/yapf
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env bash
-
-# Check if yapf is installed, otherwise exit 1
-[[ -x "$(command -v yapf)" ]] || (
- echo "yapf not found"
- exit 1
-)
-
-yapf -pri openllm-python/** 2>/dev/null
-yapf -pri openllm-core/** 2>/dev/null
-yapf -pri openllm-client/** 2>/dev/null
|