fix(yapf): align weird new lines break [generated] [skip ci] (#284)

fix(yapf): align weird new lines break

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
2026-06-11 18:09:52 -04:00 · 2023-09-01 05:34:22 -04:00
parent 3e45530abd
commit b7af7765d4
91 changed files with 811 additions and 1678 deletions
--- a/cz.py
+++ b/cz.py
@@ -19,10 +19,7 @@ def run_cz(dir: str, package: str):
      with tokenize.open(filepath) as file_:
        tokens = [t for t in tokenize.generate_tokens(file_.readline) if t.type in TOKEN_WHITELIST]
        token_count, line_count = len(tokens), len(set([t.start[0] for t in tokens]))
-        table.append([
-            filepath.replace(os.path.join(dir, 'src'), ''), line_count,
-            token_count / line_count if line_count != 0 else 0
-        ])
+        table.append([filepath.replace(os.path.join(dir, 'src'), ''), line_count, token_count / line_count if line_count != 0 else 0])
  print(tabulate([headers, *sorted(table, key=lambda x: -x[1])], headers='firstrow', floatfmt='.1f') + '\n')
  for dir_name, group in itertools.groupby(sorted([(x[0].rsplit('/', 1)[0], x[1]) for x in table]), key=lambda x: x[0]):
    print(f'{dir_name:35s} : {sum([x[1] for x in group]):6d}')
--- a/examples/langchain-chains-demo/service.py
+++ b/examples/langchain-chains-demo/service.py
@@ -44,17 +44,11 @@ svc = bentoml.Service("fb-ads-copy", runners=[llm.runner])
 def download(_: bentoml.Context):
  llm.runner.download_model()

-SAMPLE_INPUT = Query(
-    industry="SAAS",
-    product_name="BentoML",
-    keywords=["open source", "developer tool", "AI application platform", "serverless", "cost-efficient"],
-    llm_config=llm.runner.config.model_dump(),
-)
+SAMPLE_INPUT = Query(industry="SAAS",
+                     product_name="BentoML",
+                     keywords=["open source", "developer tool", "AI application platform", "serverless", "cost-efficient"],
+                     llm_config=llm.runner.config.model_dump())

@svc.api(input=JSON.from_sample(sample=SAMPLE_INPUT), output=Text())
 def generate(query: Query):
-  return chain.run({
-      "industry": query.industry,
-      "product_name": query.product_name,
-      "keywords": ", ".join(query.keywords)
-  })
+  return chain.run({"industry": query.industry, "product_name": query.product_name, "keywords": ", ".join(query.keywords)})
--- a/openllm-client/src/openllm_client/_base.py
+++ b/openllm-client/src/openllm_client/_base.py
@@ -65,10 +65,7 @@ class _ClientAttr:
    ...

  @abc.abstractmethod
-  def query(self,
-            prompt: str,
-            return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed',
-            **attrs: t.Any) -> t.Any:
+  def query(self, prompt: str, return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', **attrs: t.Any) -> t.Any:
    raise NotImplementedError

  # NOTE: Scikit interface
@@ -84,8 +81,7 @@ class _ClientAttr:

  @overload
  @abc.abstractmethod
-  def predict(self, prompt: str, *, return_response: t.Literal['attrs'],
-              **attrs: t.Any) -> openllm_core.GenerationOutput:
+  def predict(self, prompt: str, *, return_response: t.Literal['attrs'], **attrs: t.Any) -> openllm_core.GenerationOutput:
    ...

  @abc.abstractmethod
@@ -95,14 +91,12 @@ class _ClientAttr:
  @functools.cached_property
  def _hf_agent(self) -> transformers.HfAgent:
    if not is_transformers_available():
-      raise RuntimeError(
-          "transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
+      raise RuntimeError("transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
    if not self.supports_hf_agent:
      raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.')
    if not is_transformers_supports_agent():
      raise RuntimeError(
-          "Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'"
-      )
+          "Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'")
    import transformers
    return transformers.HfAgent(urljoin(self._address, '/hf/agent'))

@@ -183,13 +177,7 @@ class _Client(_ClientAttr):
    return BentoClient.from_url(self._address)

  # Agent integration
-  def ask_agent(self,
-                task: str,
-                *,
-                return_code: bool = False,
-                remote: bool = False,
-                agent_type: LiteralString = 'hf',
-                **attrs: t.Any) -> t.Any:
+  def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = 'hf', **attrs: t.Any) -> t.Any:
    if agent_type == 'hf': return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")

@@ -223,20 +211,13 @@ class _AsyncClient(_ClientAttr):
    return ensure_exec_coro(AsyncBentoClient.from_url(self._address))

  # Agent integration
-  async def ask_agent(self,
-                      task: str,
-                      *,
-                      return_code: bool = False,
-                      remote: bool = False,
-                      agent_type: LiteralString = 'hf',
-                      **attrs: t.Any) -> t.Any:
+  async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = 'hf', **attrs: t.Any) -> t.Any:
    if agent_type == 'hf': return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")

  async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
    if not is_transformers_supports_agent():
-      raise RuntimeError(
-          'This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0')
+      raise RuntimeError('This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0')
    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
    from transformers.tools.agents import clean_code_for_run
    from transformers.tools.agents import get_tool_creation_code
@@ -272,31 +253,23 @@ class _AsyncClient(_ClientAttr):
    self._hf_agent.log(f'\n\n==Code generated by the agent==\n{code}')
    if not return_code:
      self._hf_agent.log('\n\n==Result==')
-      self._hf_agent.cached_tools = resolve_tools(code,
-                                                  self._hf_agent.toolbox,
-                                                  remote=remote,
-                                                  cached_tools=self._hf_agent.cached_tools)
+      self._hf_agent.cached_tools = resolve_tools(code, self._hf_agent.toolbox, remote=remote, cached_tools=self._hf_agent.cached_tools)
      return evaluate(code, self._hf_agent.cached_tools, state=kwargs.copy())
    else:
      tool_code = get_tool_creation_code(code, self._hf_agent.toolbox, remote=remote)
      return f'{tool_code}\n{code}'

 class BaseClient(_Client):
-
  def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
    raise NotImplementedError

  def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput:
-    return openllm_core.EmbeddingsOutput(
-        **self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt)))
+    return openllm_core.EmbeddingsOutput(**self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt)))

  def predict(self, prompt: str, **attrs: t.Any) -> openllm_core.GenerationOutput | DictStrAny | str:
    return self.query(prompt, **attrs)

-  def query(self,
-            prompt: str,
-            return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed',
-            **attrs: t.Any) -> t.Any:
+  def query(self, prompt: str, return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', **attrs: t.Any) -> t.Any:
    return_raw_response = attrs.pop('return_raw_response', None)
    if return_raw_response is not None:
      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
@@ -306,32 +279,27 @@ class BaseClient(_Client):
      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
      if return_attrs is True: return_response = 'attrs'
    use_default_prompt_template = attrs.pop('use_default_prompt_template', False)
-    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(
-        prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
-    r = openllm_core.GenerationOutput(**self.call(
-        'generate',
-        openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(
-            **generate_kwargs)).model_dump()))
+    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt,
+                                                                                  use_default_prompt_template=use_default_prompt_template,
+                                                                                  **attrs)
+    r = openllm_core.GenerationOutput(
+        **self.call('generate',
+                    openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump()))
    if return_response == 'attrs': return r
    elif return_response == 'raw': return bentoml_cattr.unstructure(r)
    else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs)

 class BaseAsyncClient(_AsyncClient):
-
  async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
    raise NotImplementedError

  async def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput:
-    return openllm_core.EmbeddingsOutput(
-        **(await self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt))))
+    return openllm_core.EmbeddingsOutput(**(await self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt))))

  async def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
    return await self.query(prompt, **attrs)

-  async def query(self,
-                  prompt: str,
-                  return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed',
-                  **attrs: t.Any) -> t.Any:
+  async def query(self, prompt: str, return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', **attrs: t.Any) -> t.Any:
    return_raw_response = attrs.pop('return_raw_response', None)
    if return_raw_response is not None:
      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
@@ -341,12 +309,12 @@ class BaseAsyncClient(_AsyncClient):
      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
      if return_attrs is True: return_response = 'attrs'
    use_default_prompt_template = attrs.pop('use_default_prompt_template', False)
-    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(
-        prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
-    r = openllm_core.GenerationOutput(**(await self.call(
-        'generate',
-        openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(
-            **generate_kwargs)).model_dump())))
+    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt,
+                                                                                  use_default_prompt_template=use_default_prompt_template,
+                                                                                  **attrs)
+    r = openllm_core.GenerationOutput(
+        **(await self.call('generate',
+                           openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump())))
    if return_response == 'attrs': return r
    elif return_response == 'raw': return bentoml_cattr.unstructure(r)
    else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs)
--- a/openllm-client/src/openllm_client/benmin/_grpc.py
+++ b/openllm-client/src/openllm_client/benmin/_grpc.py
@@ -69,16 +69,10 @@ def dispatch_channel(server_url: str,
  credentials = None
  if ssl:
    if ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
-    credentials = grpc.ssl_channel_credentials(**{
-        k: load_from_file(v) if isinstance(v, str) else v for k, v in ssl_client_credentials.items()
-    })
+    credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in ssl_client_credentials.items()})

  if typ == 'async' and ssl:
-    return aio.secure_channel(server_url,
-                              credentials=credentials,
-                              options=options,
-                              compression=compression,
-                              interceptors=interceptors)
+    return aio.secure_channel(server_url, credentials=credentials, options=options, compression=compression, interceptors=interceptors)
  elif typ == 'async':
    return aio.insecure_channel(server_url, options=options, compression=compression, interceptors=interceptors)
  elif typ == 'sync' and ssl:
@@ -109,21 +103,12 @@ class GrpcClient(Client):
  def inner(self) -> grpc.Channel:
    if self.ssl:
      if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
-      credentials = grpc.ssl_channel_credentials(**{
-          k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()
-      })
-      return grpc.secure_channel(self.server_url,
-                                 credentials=credentials,
-                                 options=self.options,
-                                 compression=self.compression)
+      credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()})
+      return grpc.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression)
    return grpc.insecure_channel(self.server_url, options=self.options, compression=self.compression)

  @staticmethod
-  def wait_until_server_ready(host: str,
-                              port: int,
-                              timeout: float = 30,
-                              check_interval: int = 1,
-                              **kwargs: t.Any) -> None:
+  def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
    with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}",
                          typ='sync',
                          options=kwargs.get('options', None),
@@ -167,18 +152,17 @@ class GrpcClient(Client):
    reflection = bentoml.Service(metadata.name)
    for api in metadata.apis:
      try:
-        reflection.apis[api.name] = InferenceAPI[t.Any](
-            None,
-            bentoml.io.from_spec({
-                'id': api.input.descriptor_id,
-                'args': json_format.MessageToDict(api.input.attributes).get('args', None)
-            }),
-            bentoml.io.from_spec({
-                'id': api.output.descriptor_id,
-                'args': json_format.MessageToDict(api.output.attributes).get('args', None)
-            }),
-            name=api.name,
-            doc=api.docs)
+        reflection.apis[api.name] = InferenceAPI[t.Any](None,
+                                                        bentoml.io.from_spec({
+                                                            'id': api.input.descriptor_id,
+                                                            'args': json_format.MessageToDict(api.input.attributes).get('args', None)
+                                                        }),
+                                                        bentoml.io.from_spec({
+                                                            'id': api.output.descriptor_id,
+                                                            'args': json_format.MessageToDict(api.output.attributes).get('args', None)
+                                                        }),
+                                                        name=api.name,
+                                                        doc=api.docs)
      except Exception as e:
        logger.error('Failed to instantiate client for API %s: ', api.name, e)
    return cls(url, reflection, **kwargs)
@@ -187,24 +171,16 @@ class GrpcClient(Client):
    return services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service=''))

  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
-    channel_kwargs = {
-        k: kwargs.pop(f'_grpc_channel_{k}', None)
-        for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'}
-    }
+    channel_kwargs = {k: kwargs.pop(f'_grpc_channel_{k}', None) for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'}}
    if _inference_api.multi_input:
      if data is not None:
-        raise ValueError(
-            f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
+        raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
      fake_resp = ensure_exec_coro(_inference_api.input.to_proto(kwargs))
    else:
      fake_resp = ensure_exec_coro(_inference_api.input.to_proto(data))
    api_fn = {v: k for k, v in self.svc.apis.items()}
    stubs = services.BentoServiceStub(self.inner)
-    proto = stubs.Call(
-        pb.Request(**{
-            'api_name': api_fn[_inference_api],
-            _inference_api.input.proto_fields[0]: fake_resp
-        }), **channel_kwargs)
+    proto = stubs.Call(pb.Request(**{'api_name': api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs)
    return ensure_exec_coro(_inference_api.output.from_proto(getattr(proto, proto.WhichOneof('content'))))

 class AsyncGrpcClient(AsyncClient):
@@ -230,25 +206,16 @@ class AsyncGrpcClient(AsyncClient):
  def inner(self) -> aio.Channel:
    if self.ssl:
      if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
-      credentials = grpc.ssl_channel_credentials(**{
-          k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()
-      })
+      credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()})
      return aio.secure_channel(self.server_url,
                                credentials=credentials,
                                options=self.options,
                                compression=self.compression,
                                interceptors=self.interceptors)
-    return aio.insecure_channel(self.server_url,
-                                options=self.options,
-                                compression=self.compression,
-                                interceptors=self.interceptors)
+    return aio.insecure_channel(self.server_url, options=self.options, compression=self.compression, interceptors=self.interceptors)

  @staticmethod
-  async def wait_until_server_ready(host: str,
-                                    port: int,
-                                    timeout: float = 30,
-                                    check_interval: int = 1,
-                                    **kwargs: t.Any) -> None:
+  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
    async with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}",
                                typ='async',
                                options=kwargs.get('options', None),
@@ -293,18 +260,17 @@ class AsyncGrpcClient(AsyncClient):
    reflection = bentoml.Service(metadata.name)
    for api in metadata.apis:
      try:
-        reflection.apis[api.name] = InferenceAPI[t.Any](
-            None,
-            bentoml.io.from_spec({
-                'id': api.input.descriptor_id,
-                'args': json_format.MessageToDict(api.input.attributes).get('args', None)
-            }),
-            bentoml.io.from_spec({
-                'id': api.output.descriptor_id,
-                'args': json_format.MessageToDict(api.output.attributes).get('args', None)
-            }),
-            name=api.name,
-            doc=api.docs)
+        reflection.apis[api.name] = InferenceAPI[t.Any](None,
+                                                        bentoml.io.from_spec({
+                                                            'id': api.input.descriptor_id,
+                                                            'args': json_format.MessageToDict(api.input.attributes).get('args', None)
+                                                        }),
+                                                        bentoml.io.from_spec({
+                                                            'id': api.output.descriptor_id,
+                                                            'args': json_format.MessageToDict(api.output.attributes).get('args', None)
+                                                        }),
+                                                        name=api.name,
+                                                        doc=api.docs)
      except Exception as e:
        logger.error('Failed to instantiate client for API %s: ', api.name, e)
    return cls(url, reflection, **kwargs)
@@ -313,25 +279,17 @@ class AsyncGrpcClient(AsyncClient):
    return await services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service=''))

  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
-    channel_kwargs = {
-        k: kwargs.pop(f'_grpc_channel_{k}', None)
-        for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'}
-    }
+    channel_kwargs = {k: kwargs.pop(f'_grpc_channel_{k}', None) for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'}}
    state = self.inner.get_state(try_to_connect=True)
    if state != grpc.ChannelConnectivity.READY: await self.inner.channel_ready()
    if _inference_api.multi_input:
      if data is not None:
-        raise ValueError(
-            f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
+        raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
      fake_resp = await _inference_api.input.to_proto(kwargs)
    else:
      fake_resp = await _inference_api.input.to_proto(data)
    api_fn = {v: k for k, v in self.svc.apis.items()}
    async with self.inner:
      stubs = services.BentoServiceStub(self.inner)
-      proto = await stubs.Call(
-          pb.Request(**{
-              'api_name': api_fn[_inference_api],
-              _inference_api.input.proto_fields[0]: fake_resp
-          }), **channel_kwargs)
+      proto = await stubs.Call(pb.Request(**{'api_name': api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs)
    return await _inference_api.output.from_proto(getattr(proto, proto.WhichOneof('content')))
--- a/openllm-client/src/openllm_client/benmin/_http.py
+++ b/openllm-client/src/openllm_client/benmin/_http.py
@@ -24,18 +24,13 @@ from openllm_core.utils import ensure_exec_coro
 logger = logging.getLogger(__name__)

 class HttpClient(Client):
-
  @functools.cached_property
  def inner(self) -> httpx.Client:
    if not urlparse(self.server_url).netloc: raise ValueError(f'Invalid server url: {self.server_url}')
    return httpx.Client(base_url=self.server_url)

  @staticmethod
-  def wait_until_server_ready(host: str,
-                              port: int,
-                              timeout: float = 30,
-                              check_interval: int = 1,
-                              **kwargs: t.Any) -> None:
+  def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
    host = host if '://' in host else 'http://' + host
    logger.debug('Waiting for server @ `%s:%d` to be ready...', host, port)
    start = time.time()
@@ -63,9 +58,7 @@ class HttpClient(Client):
    url = url if '://' in url else 'http://' + url
    resp = httpx.get(f'{url}/docs.json')
    if resp.status_code != 200:
-      raise ValueError(
-          f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{resp.content.decode()}'
-      )
+      raise ValueError(f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{resp.content.decode()}')
    _spec = orjson.loads(resp.content)

    reflection = bentoml.Service(_spec['info']['title'])
@@ -96,8 +89,7 @@ class HttpClient(Client):
    kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_grpc_')}
    if _inference_api.multi_input:
      if data is not None:
-        raise ValueError(
-            f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
+        raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
      fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(kwargs, None))
    else:
      fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(data, None))
@@ -106,8 +98,7 @@ class HttpClient(Client):
    if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
    else: body = fake_resp.body

-    resp = self.inner.post('/' +
-                           _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route,
+    resp = self.inner.post('/' + _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route,
                           data=body,
                           headers={'content-type': fake_resp.headers['content-type']},
                           timeout=self.timeout)
@@ -120,18 +111,13 @@ class HttpClient(Client):
    return ensure_exec_coro(_inference_api.output.from_http_request(fake_req))

 class AsyncHttpClient(AsyncClient):
-
  @functools.cached_property
  def inner(self) -> httpx.AsyncClient:
    if not urlparse(self.server_url).netloc: raise ValueError(f'Invalid server url: {self.server_url}')
    return httpx.AsyncClient(base_url=self.server_url)

  @staticmethod
-  async def wait_until_server_ready(host: str,
-                                    port: int,
-                                    timeout: float = 30,
-                                    check_interval: int = 1,
-                                    **kwargs: t.Any) -> None:
+  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
    host = host if '://' in host else 'http://' + host
    logger.debug('Waiting for server @ `%s:%d` to be ready...', host, port)
    start = time.time()
@@ -148,8 +134,7 @@ class AsyncHttpClient(AsyncClient):
    async with httpx.AsyncClient(base_url=f'{host}:{port}') as sess:
      resp = await sess.get('/readyz')
      if resp.status_code != 200:
-        raise TimeoutError(
-            f'Timeout while waiting for server @ `{host}:{port}` to be ready: {resp.status_code}: {resp.content!s}')
+        raise TimeoutError(f'Timeout while waiting for server @ `{host}:{port}` to be ready: {resp.status_code}: {resp.content!s}')

  async def health(self) -> httpx.Response:
    return await self.inner.get('/readyz')
@@ -160,9 +145,7 @@ class AsyncHttpClient(AsyncClient):
    async with httpx.AsyncClient(base_url=url) as session:
      resp = await session.get('/docs.json')
      if resp.status_code != 200:
-        raise ValueError(
-            f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await resp.aread()).decode()}'
-        )
+        raise ValueError(f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await resp.aread()).decode()}')
      _spec = orjson.loads(await resp.aread())

    reflection = bentoml.Service(_spec['info']['title'])
@@ -193,8 +176,7 @@ class AsyncHttpClient(AsyncClient):
    kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_grpc_')}
    if _inference_api.multi_input:
      if data is not None:
-        raise ValueError(
-            f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
+        raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
      fake_resp = await _inference_api.input.to_http_response(kwargs, None)
    else:
      fake_resp = await _inference_api.input.to_http_response(data, None)
@@ -203,11 +185,10 @@ class AsyncHttpClient(AsyncClient):
    if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
    else: body = t.cast(t.Any, fake_resp.body)

-    resp = await self.inner.post(
-        '/' + _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route,
-        data=body,
-        headers={'content-type': fake_resp.headers['content-type']},
-        timeout=self.timeout)
+    resp = await self.inner.post('/' + _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route,
+                                 data=body,
+                                 headers={'content-type': fake_resp.headers['content-type']},
+                                 timeout=self.timeout)
    if resp.status_code != 200: raise ValueError(f'Error making request: {resp.status_code}: {(await resp.aread())!s}')
    fake_req = starlette.requests.Request(scope={'type': 'http'})
    headers = starlette.datastructures.Headers(headers=resp.headers)
--- a/openllm-client/src/openllm_client/client.py
+++ b/openllm-client/src/openllm_client/client.py
@@ -16,25 +16,21 @@ def process_http_address(self: AsyncHTTPClient | HTTPClient, address: str) -> No
  else: self._port = next(iter(_port))

 class HTTPClient(BaseClient):
-
  def __init__(self, address: str, timeout: int = 30):
    process_http_address(self, address)
    super().__init__(address, timeout)

 class AsyncHTTPClient(BaseAsyncClient):
-
  def __init__(self, address: str, timeout: int = 30):
    process_http_address(self, address)
    super().__init__(address, timeout)

 class GrpcClient(BaseClient):
-
  def __init__(self, address: str, timeout: int = 30):
    self._host, self._port = address.split(':')
    super().__init__(address, timeout)

 class AsyncGrpcClient(BaseAsyncClient):
-
  def __init__(self, address: str, timeout: int = 30):
    self._host, self._port = address.split(':')
    super().__init__(address, timeout)
--- a/openllm-core/src/openllm_core/_configuration.py
+++ b/openllm-core/src/openllm_core/_configuration.py
@@ -104,7 +104,6 @@ config_merger = Merger([(dict, 'merge')], ['override'], ['override'])

 # case insensitive, but rename to conform with type
 class _PeftEnumMeta(enum.EnumMeta):
-
  def __getitem__(self, __key: str | t.Any, /) -> t.Any:
    if isinstance(__key, str): __key = inflection.underscore(__key).upper()
    return self._member_map_[__key]
@@ -198,35 +197,26 @@ class FineTuneConfig:

  adapter_type: PeftType = dantic.Field(
      'lora',
-      description=
-      f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'",
+      description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'",
      use_default_converter=False,
      converter=_adapter_converter)
-  adapter_config: t.Dict[str, t.Any] = dantic.Field(
-      None,
-      description='The configuration for the adapter. The content of the dict depends on the adapter type.',
-      validator=attr.validators.optional(attr.validators.instance_of(dict)),
-      converter=attr.converters.default_if_none(factory=dict),
-      use_default_converter=False)
-  inference_mode: bool = dantic.Field(False,
-                                      description='Whether to use this Adapter for inference',
-                                      use_default_converter=False)
-  llm_config_class: type[LLMConfig] = dantic.Field(None,
-                                                   description='The reference class to openllm.LLMConfig',
-                                                   use_default_converter=False)
+  adapter_config: t.Dict[str,
+                         t.Any] = dantic.Field(None,
+                                               description='The configuration for the adapter. The content of the dict depends on the adapter type.',
+                                               validator=attr.validators.optional(attr.validators.instance_of(dict)),
+                                               converter=attr.converters.default_if_none(factory=dict),
+                                               use_default_converter=False)
+  inference_mode: bool = dantic.Field(False, description='Whether to use this Adapter for inference', use_default_converter=False)
+  llm_config_class: type[LLMConfig] = dantic.Field(None, description='The reference class to openllm.LLMConfig', use_default_converter=False)

  def to_peft_config(self) -> peft.PeftConfig:  # type: ignore[name-defined]
    adapter_config = self.adapter_config.copy()
    # no need for peft_type since it is internally managed by OpenLLM and PEFT
    if 'peft_type' in adapter_config: adapter_config.pop('peft_type')
    # respect user set task_type if it is passed, otherwise use one managed by OpenLLM
-    task_type, inference_mode = adapter_config.pop(
-        'task_type',
-        peft.TaskType[self.llm_config_class.peft_task_type()]), adapter_config.pop('inference_mode',
-                                                                                   self.inference_mode)
-    return peft.PEFT_TYPE_TO_CONFIG_MAPPING[self.adapter_type.to_str()](task_type=task_type,
-                                                                        inference_mode=inference_mode,
-                                                                        **adapter_config)
+    task_type, inference_mode = adapter_config.pop('task_type', peft.TaskType[self.llm_config_class.peft_task_type()]), adapter_config.pop(
+        'inference_mode', self.inference_mode)
+    return peft.PEFT_TYPE_TO_CONFIG_MAPPING[self.adapter_type.to_str()](task_type=task_type, inference_mode=inference_mode, **adapter_config)

  def train(self) -> FineTuneConfig:
    _object_setattr(self, 'inference_mode', False)
@@ -237,14 +227,10 @@ class FineTuneConfig:
    return self

  def with_config(self, **attrs: t.Any) -> FineTuneConfig:
-    adapter_type, inference_mode = attrs.pop('adapter_type',
-                                             self.adapter_type), attrs.get('inference_mode', self.inference_mode)
+    adapter_type, inference_mode = attrs.pop('adapter_type', self.adapter_type), attrs.get('inference_mode', self.inference_mode)
    if 'llm_config_class' in attrs:
      raise ForbiddenAttributeError("'llm_config_class' should not be passed when using 'with_config'.")
-    return attr.evolve(self,
-                       adapter_type=adapter_type,
-                       inference_mode=inference_mode,
-                       adapter_config=config_merger.merge(self.adapter_config, attrs))
+    return attr.evolve(self, adapter_type=adapter_type, inference_mode=inference_mode, adapter_config=config_merger.merge(self.adapter_config, attrs))

@attr.frozen(slots=True, repr=False, init=False)
 class GenerationConfig(ReprMixin):
@@ -254,16 +240,14 @@ class GenerationConfig(ReprMixin):
  to be used conjunction with LLMConfig. The instance of the generation config can then be accessed
  via ``LLMConfig.generation_config``.
  '''
-  max_new_tokens: int = dantic.Field(
-      20, ge=0, description='The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.')
+  max_new_tokens: int = dantic.Field(20, ge=0, description='The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.')
  min_length: int = dantic.Field(
      0,
      ge=0,
      description=
      'The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.'
  )
-  min_new_tokens: int = dantic.Field(
-      description='The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.')
+  min_new_tokens: int = dantic.Field(description='The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.')
  early_stopping: bool = dantic.Field(
      False,
      description=
@@ -280,24 +264,15 @@ class GenerationConfig(ReprMixin):
      'Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.'
  )
  penalty_alpha: float = dantic.Field(
-      description='The values balance the model confidence and the degeneration penalty in contrastive search decoding.'
-  )
+      description='The values balance the model confidence and the degeneration penalty in contrastive search decoding.')
  use_cache: bool = dantic.Field(
-      True,
-      description=
-      'Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.'
-  )
-  temperature: float = dantic.Field(1.0,
-                                    ge=0.0,
-                                    le=1.0,
-                                    description='The value used to modulate the next token probabilities.')
-  top_k: int = dantic.Field(
-      50, description='The number of highest probability vocabulary tokens to keep for top-k-filtering.')
+      True, description='Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.')
+  temperature: float = dantic.Field(1.0, ge=0.0, le=1.0, description='The value used to modulate the next token probabilities.')
+  top_k: int = dantic.Field(50, description='The number of highest probability vocabulary tokens to keep for top-k-filtering.')
  top_p: float = dantic.Field(
      1.0,
      description=
-      'If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.'
-  )
+      'If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.')
  typical_p: float = dantic.Field(
      1.0,
      description=
@@ -320,21 +295,18 @@ class GenerationConfig(ReprMixin):
  )
  repetition_penalty: float = dantic.Field(
      1.0,
-      description=
-      'The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.'
+      description='The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.'
  )
  encoder_repetition_penalty: float = dantic.Field(
      1.0,
      description=
-      'The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty.'
-  )
+      'The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty.')
  length_penalty: float = dantic.Field(
      1.0,
      description=
      'Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences.'
  )
-  no_repeat_ngram_size: int = dantic.Field(
-      0, description='If set to int > 0, all ngrams of that size can only occur once.')
+  no_repeat_ngram_size: int = dantic.Field(0, description='If set to int > 0, all ngrams of that size can only occur once.')
  bad_words_ids: t.List[t.List[int]] = dantic.Field(
      description=
      'List of token ids that are not allowed to be generated. In order to get the token ids of the words that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`.'
@@ -381,35 +353,22 @@ class GenerationConfig(ReprMixin):
      description=
      'A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123.'
  )
-  num_return_sequences: int = dantic.Field(
-      1, description='The number of independently computed returned sequences for each element in the batch.')
+  num_return_sequences: int = dantic.Field(1, description='The number of independently computed returned sequences for each element in the batch.')
  output_attentions: bool = dantic.Field(
      False,
-      description=
-      'Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.'
-  )
+      description='Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.')
  output_hidden_states: bool = dantic.Field(
-      False,
-      description=
-      'Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.'
-  )
+      False, description='Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.')
  output_scores: bool = dantic.Field(
-      False,
-      description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.'
-  )
+      False, description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.')
  pad_token_id: int = dantic.Field(description='The id of the *padding* token.')
  bos_token_id: int = dantic.Field(description='The id of the *beginning-of-sequence* token.')
  eos_token_id: t.Union[int, t.List[int]] = dantic.Field(
-      description=
-      'The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.')
+      description='The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.')
  encoder_no_repeat_ngram_size: int = dantic.Field(
-      0,
-      description=
-      'If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.'
-  )
+      0, description='If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.')
  decoder_start_token_id: int = dantic.Field(
-      description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.'
-  )
+      description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.')

  if t.TYPE_CHECKING and not MYPY:
    # stubs this for pyright as mypy already has a attr plugin builtin
@@ -418,9 +377,7 @@ class GenerationConfig(ReprMixin):

  def __init__(self, *, _internal: bool = False, **attrs: t.Any):
    if not _internal:
-      raise RuntimeError(
-          'GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config'
-      )
+      raise RuntimeError('GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config')
    self.__attrs_init__(**attrs)

  def __getitem__(self, item: str) -> t.Any:
@@ -438,9 +395,7 @@ bentoml_cattr.register_unstructure_hook_factory(
                                         _cattrs_omit_if_default=False,
                                         _cattrs_use_linecache=True,
                                         **{
-                                             k: override(omit=True)
-                                             for k, v in attr.fields_dict(cls).items()
-                                             if v.default in (None, attr.NOTHING)
+                                             k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)
                                         }))

@attr.frozen(slots=True, repr=False, init=False)
@@ -471,13 +426,8 @@ class SamplingParams(ReprMixin):
  )
  use_beam_search: bool = dantic.Field(False, description='Whether to use beam search instead of sampling.')
  stop: t.List[str] = dantic.Field(
-      None,
-      description=
-      'List of strings that stop the generation when they are generated. The returned output will not contain the stop strings.'
-  )
-  ignore_eos: bool = dantic.Field(
-      False,
-      description='Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.')
+      None, description='List of strings that stop the generation when they are generated. The returned output will not contain the stop strings.')
+  ignore_eos: bool = dantic.Field(False, description='Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.')
  logprobs: int = dantic.Field(None, description='Number of log probabilities to return per output token.')

  if t.TYPE_CHECKING:
@@ -526,9 +476,7 @@ class SamplingParams(ReprMixin):
    temperature = first_not_none(attrs.pop('temperature', None), default=generation_config['temperature'])
    top_k = first_not_none(attrs.pop('top_k', None), default=generation_config['top_k'])
    top_p = first_not_none(attrs.pop('top_p', None), default=generation_config['top_p'])
-    max_tokens = first_not_none(attrs.pop('max_tokens', None),
-                                attrs.pop('max_new_tokens', None),
-                                default=generation_config['max_new_tokens'])
+    max_tokens = first_not_none(attrs.pop('max_tokens', None), attrs.pop('max_new_tokens', None), default=generation_config['max_new_tokens'])
    return cls(_internal=True, temperature=temperature, top_k=top_k, top_p=top_p, max_tokens=max_tokens, **attrs)

 bentoml_cattr.register_unstructure_hook_factory(
@@ -538,13 +486,11 @@ bentoml_cattr.register_unstructure_hook_factory(
                                         _cattrs_omit_if_default=False,
                                         _cattrs_use_linecache=True,
                                         **{
-                                             k: override(omit=True)
-                                             for k, v in attr.fields_dict(cls).items()
-                                             if v.default in (None, attr.NOTHING)
+                                             k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)
                                         }))
 bentoml_cattr.register_structure_hook_factory(
-    lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), lambda cls: make_dict_structure_fn(
-        cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens')))
+    lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams),
+    lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens')))

 # cached it here to save one lookup per assignment
 _object_getattribute = object.__getattribute__
@@ -607,12 +553,10 @@ _transformed_type: DictStrAny = {
                                  use_default_converter=False,
                                  type=_transformed_type.get(k, ann),
                                  metadata={'target': f'__openllm_{k}__'},
-                                  description=f'ModelSettings field for {k}.'))
-                 for k, ann in t.get_type_hints(ModelSettings).items()
+                                  description=f'ModelSettings field for {k}.')) for k, ann in t.get_type_hints(ModelSettings).items()
             ])
 class _ModelSettingsAttr:
  '''Internal attrs representation of ModelSettings.'''
-
  def __getitem__(self, key: str) -> t.Any:
    if key in codegen.get_annotations(ModelSettings):
      return _object_getattribute(self, key)
@@ -676,8 +620,7 @@ def get_default_backend(backend_mapping: dict[LiteralResourceSpec, LiteralBacken
 def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr:
  if 'generation_class' in cl_.__config__:
    raise ValueError(
-        f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead."
-    )
+        f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.")

  required_fields = {k for k, ann in t.get_type_hints(ModelSettings).items() if t.get_origin(ann) is Required}
  if any(i not in cl_.__config__ for i in required_fields):
@@ -689,11 +632,9 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
  _final_value_dct: DictStrAny = {}

  if not has_custom_name:
-    _final_value_dct['model_name'] = inflection.underscore(
-        _cl_name) if _settings_attr['name_type'] == 'dasherize' else _cl_name.lower()
+    _final_value_dct['model_name'] = inflection.underscore(_cl_name) if _settings_attr['name_type'] == 'dasherize' else _cl_name.lower()
    _final_value_dct['start_name'] = inflection.dasherize(
-        _final_value_dct['model_name']
-    ) if _settings_attr['name_type'] == 'dasherize' else _final_value_dct['model_name']
+        _final_value_dct['model_name']) if _settings_attr['name_type'] == 'dasherize' else _final_value_dct['model_name']

  model_name = _final_value_dct['model_name'] if 'model_name' in _final_value_dct else _settings_attr.model_name
  # if the default implementation dependencies doesn't exist, then always fallback to 'pt'
@@ -703,9 +644,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
    if not BACKENDS_MAPPING[library_stub][0](): default_backend[rs] = 'pt'
  _final_value_dct['default_backend'] = default_backend

-  env = openllm_core.utils.EnvVarMixin(model_name,
-                                       backend=get_default_backend(default_backend),
-                                       model_id=_settings_attr.default_id)
+  env = openllm_core.utils.EnvVarMixin(model_name, backend=get_default_backend(default_backend), model_id=_settings_attr.default_id)
  _final_value_dct['env'] = env

  _final_value_dct['service_name'] = f'generated_{model_name}_service.py'
@@ -729,16 +668,10 @@ bentoml_cattr.register_structure_hook(_ModelSettingsAttr, structure_settings)
 def _setattr_class(attr_name: str, value_var: t.Any) -> str:
  return f"setattr(cls, '{attr_name}', {value_var})"

-def _make_assignment_script(cls: type[LLMConfig],
-                            attributes: attr.AttrsInstance,
-                            _prefix: LiteralString = 'openllm') -> t.Callable[..., None]:
+def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance, _prefix: LiteralString = 'openllm') -> t.Callable[..., None]:
  '''Generate the assignment script with prefix attributes __openllm_<value>__.'''
  args: ListStr = []
-  globs: DictStrAny = {
-      'cls': cls,
-      '_cached_attribute': attributes,
-      '_cached_getattribute_get': _object_getattribute.__get__
-  }
+  globs: DictStrAny = {'cls': cls, '_cached_attribute': attributes, '_cached_getattribute_get': _object_getattribute.__get__}
  annotations: DictStrAny = {'return': None}

  lines: ListStr = []
@@ -748,18 +681,12 @@ def _make_assignment_script(cls: type[LLMConfig],
    lines.append(_setattr_class(arg_name, attr_name))
    annotations[attr_name] = field.type

-  return codegen.generate_function(cls,
-                                   '__assign_attr',
-                                   lines,
-                                   args=('cls', *args),
-                                   globs=globs,
-                                   annotations=annotations)
+  return codegen.generate_function(cls, '__assign_attr', lines, args=('cls', *args), globs=globs, annotations=annotations)

 _reserved_namespace = {'__config__', 'GenerationConfig', 'SamplingParams'}

@attr.define(slots=True)
 class _ConfigAttr:
-
  @staticmethod
  def Field(default: t.Any = None, **attrs: t.Any) -> t.Any:
    '''Field is a alias to the internal dantic utilities to easily create
@@ -825,7 +752,6 @@ class _ConfigAttr:
    '''The result generated SamplingParams class for this LLMConfig. This will be used
        to create arguments for vLLM LLMEngine that can be used throughout the lifecycle.
        This class will also be managed internally by OpenLLM.'''
-
    def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
      '''Generated __attrs_init__ for LLMConfig subclass that follows the attrs contract.'''

@@ -917,8 +843,7 @@ class _ConfigBuilder:
  It takes `these` arguments as a fully parsed attr.Attribute[t.Any] from __init_subclass__
  """

-  __slots__ = ('_cls', '_cls_dict', '_attr_names', '_attrs', '_model_name', '_base_attr_map', '_base_names',
-               '_has_pre_init', '_has_post_init')
+  __slots__ = ('_cls', '_cls_dict', '_attr_names', '_attrs', '_model_name', '_base_attr_map', '_base_names', '_has_pre_init', '_has_post_init')

  def __init__(self,
               cls: type[LLMConfig],
@@ -931,8 +856,7 @@ class _ConfigBuilder:
                                                        auto_attribs,
                                                        kw_only,
                                                        collect_by_mro,
-                                                        field_transformer=codegen.make_env_transformer(
-                                                            cls, cls.__openllm_model_name__))
+                                                        field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__))
    self._cls, self._model_name, self._cls_dict, self._attrs, self._base_names, self._base_attr_map = cls, cls.__openllm_model_name__, dict(
        cls.__dict__), attrs, {a.name for a in base_attrs}, base_attr_map
    self._attr_names = tuple(a.name for a in attrs)
@@ -957,13 +881,11 @@ class _ConfigBuilder:
    existing_slots: DictStrAny = {}
    for base_cls in self._cls.__mro__[1:-1]:
      if base_cls.__dict__.get('__weakref__', None) is not None: weakref_inherited = True
-      existing_slots.update(
-          {name: getattr(base_cls, name, codegen._sentinel) for name in getattr(base_cls, '__slots__', [])})
+      existing_slots.update({name: getattr(base_cls, name, codegen._sentinel) for name in getattr(base_cls, '__slots__', [])})

    names = self._attr_names
    base_names = set(self._base_names)
-    if '__weakref__' not in getattr(self._cls, '__slots__',
-                                    ()) and '__weakref__' not in names and not weakref_inherited:
+    if '__weakref__' not in getattr(self._cls, '__slots__', ()) and '__weakref__' not in names and not weakref_inherited:
      names += ('__weakref__',)
    # We only add the names of attributes that aren't inherited.
    # Setting __slots__ to inherited attributes wastes memory.
@@ -1022,16 +944,14 @@ class _ConfigBuilder:
  def add_attrs_init(self) -> Self:
    self._cls_dict['__attrs_init__'] = codegen.add_method_dunders(
        self._cls,
-        _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False,
-                   self._base_attr_map, False, None, True))
+        _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True))
    return self

  def add_repr(self) -> Self:
    for key, fn in ReprMixin.__dict__.items():
      if key in ('__repr__', '__str__', '__repr_name__', '__repr_str__', '__repr_args__'):
        self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn)
-    self._cls_dict['__repr_keys__'] = property(
-        lambda _: {i.name for i in self._attrs} | {'generation_config', 'sampling_config'})
+    self._cls_dict['__repr_keys__'] = property(lambda _: {i.name for i in self._attrs} | {'generation_config', 'sampling_config'})
    return self

@attr.define(slots=True, init=False)
@@ -1124,7 +1044,6 @@ class LLMConfig(_ConfigAttr):
  Future work:
  - Support pydantic-core as validation backend.
  """
-
  def __init_subclass__(cls, **_: t.Any):
    """The purpose of this ``__init_subclass__`` is to offer pydantic UX while adhering to attrs contract.

@@ -1144,10 +1063,7 @@ class LLMConfig(_ConfigAttr):
    # auto assignment attributes generated from __config__ after create the new slot class.
    _make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettingsAttr))(cls)

-    def _make_subclass(class_attr: str,
-                       base: type[At],
-                       globs: dict[str, t.Any] | None = None,
-                       suffix_env: LiteralString | None = None) -> type[At]:
+    def _make_subclass(class_attr: str, base: type[At], globs: dict[str, t.Any] | None = None, suffix_env: LiteralString | None = None) -> type[At]:
      camel_name = cls.__name__.replace('Config', '')
      klass = attr.make_class(f'{camel_name}{class_attr}', [],
                              bases=(base,),
@@ -1162,8 +1078,7 @@ class LLMConfig(_ConfigAttr):
                                  cls.__openllm_model_name__,
                                  suffix=suffix_env,
                                  globs=globs,
-                                  default_callback=lambda field_name, field_default: getattr(
-                                      getattr(cls, class_attr), field_name, field_default)
+                                  default_callback=lambda field_name, field_default: getattr(getattr(cls, class_attr), field_name, field_default)
                                  if codegen.has_own_attribute(cls, class_attr) else field_default))
      # For pickling to work, the __module__ variable needs to be set to the
      # frame where the class is created. This respect the module that is created from cls
@@ -1195,13 +1110,11 @@ class LLMConfig(_ConfigAttr):
    unannotated = ca_names - annotated_names
    if len(unannotated) > 0:
      missing_annotated = sorted(unannotated, key=lambda n: t.cast('_CountingAttr', cd.get(n)).counter)
-      raise openllm_core.exceptions.MissingAnnotationAttributeError(
-          f"The following field doesn't have a type annotation: {missing_annotated}")
+      raise openllm_core.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}")
    # We need to set the accepted key before generation_config
    # as generation_config is a special field that users shouldn't pass.
-    cls.__openllm_accepted_keys__ = set(these.keys()) | {
-        a.name for a in attr.fields(cls.__openllm_generation_class__)
-    } | {a.name for a in attr.fields(cls.__openllm_sampling_class__)}
+    cls.__openllm_accepted_keys__ = set(these.keys()) | {a.name for a in attr.fields(cls.__openllm_generation_class__)
+                                                         } | {a.name for a in attr.fields(cls.__openllm_sampling_class__)}
    cls = _ConfigBuilder(cls, these).add_attrs_init().add_repr().build_class()

    # Finally, resolve the types
@@ -1214,10 +1127,9 @@ class LLMConfig(_ConfigAttr):
      cls = attr.resolve_types(cls, globalns=globs)
    # the hint cache for easier access
    cls.__openllm_hints__ = {
-        f.name: f.type for ite in
-        [attr.fields(cls),
-         attr.fields(cls.__openllm_generation_class__),
-         attr.fields(cls.__openllm_sampling_class__),] for f in ite
+        f.name: f.type
+        for ite in [attr.fields(cls), attr.fields(cls.__openllm_generation_class__),
+                    attr.fields(cls.__openllm_sampling_class__)] for f in ite
    }

    # for pickling to work, need to set the module to the correct outer frame
@@ -1233,19 +1145,13 @@ class LLMConfig(_ConfigAttr):
      )
    super().__setattr__(attr, value)

-  def __init__(self,
-               *,
-               generation_config: DictStrAny | None = None,
-               __openllm_extras__: DictStrAny | None = None,
-               **attrs: t.Any):
+  def __init__(self, *, generation_config: DictStrAny | None = None, __openllm_extras__: DictStrAny | None = None, **attrs: t.Any):
    # create a copy of the keys as cache
    _cached_keys = tuple(attrs.keys())
    _generation_cl_dict = attr.fields_dict(self.__openllm_generation_class__)
    if generation_config is None: generation_config = {k: v for k, v in attrs.items() if k in _generation_cl_dict}
    else:
-      generation_config = config_merger.merge(generation_config, {
-          k: v for k, v in attrs.items() if k in _generation_cl_dict
-      })
+      generation_config = config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in _generation_cl_dict})

    sampling_config = {k: v for k, v in attrs.items() if k in attr.fields_dict(self.__openllm_sampling_class__)}
    for k in _cached_keys:
@@ -1432,8 +1338,7 @@ class LLMConfig(_ConfigAttr):
    if item is None: raise TypeError(f"{self} doesn't understand how to index None.")
    item = inflection.underscore(item)
    if item in _reserved_namespace:
-      raise ForbiddenAttributeError(
-          f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified.")
+      raise ForbiddenAttributeError(f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified.")
    internal_attributes = f'__openllm_{item}__'
    if hasattr(self, internal_attributes): return getattr(self, internal_attributes)
    elif hasattr(self, item): return getattr(self, item)
@@ -1448,8 +1353,7 @@ class LLMConfig(_ConfigAttr):

  def __getattribute__(self, item: str) -> t.Any:
    if item in _reserved_namespace:
-      raise ForbiddenAttributeError(
-          f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified.")
+      raise ForbiddenAttributeError(f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified.")
    return _object_getattribute.__get__(self)(item)

  def __len__(self) -> int:
@@ -1461,13 +1365,12 @@ class LLMConfig(_ConfigAttr):
  def values(self) -> list[t.Any]:
    return ([getattr(self, k.name) for k in attr.fields(self.__class__)] +
            [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] +
-            [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] +
-            list(self.__openllm_extras__.values()))
+            [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.values()))

  def items(self) -> list[tuple[str, t.Any]]:
-    return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] + [
-        (k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)
-    ] + [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] +
+    return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] +
+            [(k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)] +
+            [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] +
            list(self.__openllm_extras__.items()))

  def __iter__(self) -> t.Iterator[str]:
@@ -1617,8 +1520,7 @@ class LLMConfig(_ConfigAttr):
      f = dantic.attrs_to_options(name, field, cls.__openllm_model_name__, typ=ty, suffix_sampling=True)(f)
    f = cog.optgroup.group(f'{cls.__openllm_sampling_class__.__name__} sampling options')(f)

-    total_keys = set(attr.fields_dict(cls.__openllm_generation_class__)) | set(
-        attr.fields_dict(cls.__openllm_sampling_class__))
+    total_keys = set(attr.fields_dict(cls.__openllm_generation_class__)) | set(attr.fields_dict(cls.__openllm_sampling_class__))

    if len(cls.__openllm_accepted_keys__.difference(total_keys)) == 0: return t.cast('click.Command', f)
    # We pop out 'generation_config' as it is an attribute that we don't need to expose to the CLI.
@@ -1637,8 +1539,7 @@ class LLMConfig(_ConfigAttr):

  @classmethod
  def default_backend(cls) -> LiteralBackend:
-    return first_not_none(cls.__openllm_env__['backend_value'],
-                          default=get_default_backend(cls.__openllm_default_backend__))
+    return first_not_none(cls.__openllm_env__['backend_value'], default=get_default_backend(cls.__openllm_default_backend__))

  def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
    '''This handler will sanitize all attrs and setup prompt text.
@@ -1694,6 +1595,4 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:

 bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config)
 openllm_home = os.path.expanduser(
-    os.environ.get(
-        'OPENLLM_HOME',
-        os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm')))
+    os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm')))
--- a/openllm-core/src/openllm_core/_prompt.py
+++ b/openllm-core/src/openllm_core/_prompt.py
@@ -4,13 +4,11 @@ import typing as t

 class PromptFormatter(string.Formatter):
  """This PromptFormatter is largely based on langchain's implementation."""
-
  def vformat(self, format_string: str, args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> t.Any:
    if len(args) > 0: raise ValueError('Positional arguments are not supported')
    return super().vformat(format_string, args, kwargs)

-  def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str,
-                                                                                                    t.Any]) -> None:
+  def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> None:
    extras = set(kwargs).difference(used_args)
    if extras: raise KeyError(f'Extra params passed: {extras}')

@@ -26,8 +24,7 @@ def process_prompt(prompt: str, template: str | None = None, use_prompt_template
  template_variables = default_formatter.extract_template_variables(template)
  prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
  if 'instruction' in prompt_variables:
-    raise RuntimeError(
-        "'instruction' should be passed as the first argument instead of kwargs when 'use_prompt_template=True'")
+    raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_prompt_template=True'")
  try:
    return template.format(instruction=prompt, **prompt_variables)
  except KeyError as e:
--- a/openllm-core/src/openllm_core/_schema.py
+++ b/openllm-core/src/openllm_core/_schema.py
@@ -21,11 +21,7 @@ class GenerationInput:
  adapter_name: str | None = attr.field(default=None)

  def model_dump(self) -> dict[str, t.Any]:
-    return {
-        'prompt': self.prompt,
-        'llm_config': self.llm_config.model_dump(flatten=True),
-        'adapter_name': self.adapter_name
-    }
+    return {'prompt': self.prompt, 'llm_config': self.llm_config.model_dump(flatten=True), 'adapter_name': self.adapter_name}

  @staticmethod
  def convert_llm_config(data: dict[str, t.Any] | LLMConfig, cls: type[LLMConfig] | None = None) -> LLMConfig:
@@ -43,15 +39,11 @@ class GenerationInput:
  def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]:
    return attr.make_class(inflection.camelize(llm_config['model_name']) + 'GenerationInput',
                           attrs={
-                               'prompt':
-                                   attr.field(type=str),
-                               'llm_config':
-                                   attr.field(type=llm_config.__class__,
-                                              default=llm_config,
-                                              converter=functools.partial(cls.convert_llm_config,
-                                                                          cls=llm_config.__class__)),
-                               'adapter_name':
-                                   attr.field(default=None, type=str)
+                               'prompt': attr.field(type=str),
+                               'llm_config': attr.field(type=llm_config.__class__,
+                                                        default=llm_config,
+                                                        converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)),
+                               'adapter_name': attr.field(default=None, type=str)
                           })

@attr.frozen(slots=True)
--- a/openllm-core/src/openllm_core/_strategies.py
+++ b/openllm-core/src/openllm_core/_strategies.py
@@ -151,8 +151,7 @@ def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]:
  elif isinstance(spec, list):
    return [str(x) for x in spec]
  else:
-    raise TypeError(
-        f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")
+    raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")

 def _raw_device_uuid_nvml() -> list[str] | None:
  from ctypes import CDLL
@@ -278,10 +277,8 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):

  TODO: Support CloudTPUResource
  """
-
  @classmethod
-  def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None,
-                       workers_per_resource: float) -> int:
+  def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: float) -> int:
    '''Return the number of workers to be used for the given runnable class.

    Note that for all available GPU, the number of workers will always be 1.
@@ -313,8 +310,8 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
    )

  @classmethod
-  def get_worker_env(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None,
-                     workers_per_resource: int | float, worker_index: int) -> dict[str, t.Any]:
+  def get_worker_env(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float,
+                     worker_index: int) -> dict[str, t.Any]:
    '''Get worker env for this given worker_index.

    Args:
@@ -379,19 +376,15 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin):
      # then it will round down to 2. If workers_per_resource=0.6, then it will also round up to 2.
      assigned_resource_per_worker = round(1 / workers_per_resource)
      if len(gpus) < assigned_resource_per_worker:
-        logger.warning(
-            'Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])', gpus,
-            worker_index, assigned_resource_per_worker)
-        raise IndexError(
-            f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]."
-        )
+        logger.warning('Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])', gpus, worker_index,
+                       assigned_resource_per_worker)
+        raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].")
      assigned_gpu = gpus[assigned_resource_per_worker * worker_index:assigned_resource_per_worker * (worker_index + 1)]
      dev = ','.join(assigned_gpu)
    else:
      idx = worker_index // workers_per_resource
      if idx >= len(gpus):
-        raise ValueError(
-            f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}')
+        raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}')
      dev = str(gpus[idx])
    return dev

--- a/openllm-core/src/openllm_core/_typing_compat.py
+++ b/openllm-core/src/openllm_core/_typing_compat.py
@@ -31,11 +31,7 @@ M = t.TypeVar(
    bound=
    't.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]'
 )
-T = t.TypeVar(
-    'T',
-    bound=
-    't.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]'
-)
+T = t.TypeVar('T', bound='t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]')

 def get_literal_args(typ: t.Any) -> tuple[str, ...]:
  return getattr(typ, '__args__')
@@ -132,7 +128,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
               max_latency_ms: int | None = ...,
               method_configs: dict[str, dict[str, int]] | None = ...,
               embedded: bool = False,
-              ) -> None:
+               ) -> None:
    ...

  def __call__(self, prompt: str, **attrs: t.Any) -> t.Any:
@@ -163,23 +159,19 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
    ...

 class load_model_protocol(t.Generic[M, T], t.Protocol):
-
  def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
    ...

 class load_tokenizer_protocol(t.Generic[M, T], t.Protocol):
-
  def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T:
    ...

 _R = t.TypeVar('_R', covariant=True)

 class import_model_protocol(t.Generic[_R, M, T], t.Protocol):
-
  def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R:
    ...

 class llm_post_init_protocol(t.Generic[M, T], t.Protocol):
-
  def __call__(self, llm: LLM[M, T]) -> T:
    ...
--- a/openllm-core/src/openllm_core/config/configuration_auto.py
+++ b/openllm-core/src/openllm_core/config/configuration_auto.py
@@ -24,14 +24,12 @@ if t.TYPE_CHECKING:
  ConfigItemsView = _odict_items[str, type[openllm_core.LLMConfig]]

 # NOTE: This is the entrypoint when adding new model config
-CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'),
-                                    ('falcon', 'FalconConfig'), ('flan_t5', 'FlanT5Config'),
-                                    ('gpt_neox', 'GPTNeoXConfig'), ('llama', 'LlamaConfig'), ('mpt', 'MPTConfig'),
-                                    ('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'),
-                                    ('starcoder', 'StarCoderConfig'), ('baichuan', 'BaichuanConfig')])
+CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'), ('falcon', 'FalconConfig'),
+                                    ('flan_t5', 'FlanT5Config'), ('gpt_neox', 'GPTNeoXConfig'), ('llama', 'LlamaConfig'), ('mpt', 'MPTConfig'),
+                                    ('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'), ('starcoder', 'StarCoderConfig'),
+                                    ('baichuan', 'BaichuanConfig')])

 class _LazyConfigMapping(OrderedDict, ReprMixin):
-
  def __init__(self, mapping: OrderedDict[LiteralString, LiteralString]):
    self._mapping = mapping
    self._extra_content: dict[str, t.Any] = {}
@@ -79,32 +77,21 @@ class _LazyConfigMapping(OrderedDict, ReprMixin):

 CONFIG_MAPPING: dict[str, type[openllm_core.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
 # The aliases below handle the special cases where we call underscore on the name directly without processing camelcase first.
-CONFIG_NAME_ALIASES: dict[str, str] = {
-    'chat_glm': 'chatglm',
-    'stable_lm': 'stablelm',
-    'star_coder': 'starcoder',
-    'gpt_neo_x': 'gpt_neox',
-}
+CONFIG_NAME_ALIASES: dict[str, str] = {'chat_glm': 'chatglm', 'stable_lm': 'stablelm', 'star_coder': 'starcoder', 'gpt_neo_x': 'gpt_neox'}

 class AutoConfig:
-
  def __init__(self, *_: t.Any, **__: t.Any):
-    raise EnvironmentError(
-        'Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.')
+    raise EnvironmentError('Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.')

  @classmethod
  def for_model(cls, model_name: str, **attrs: t.Any) -> openllm_core.LLMConfig:
    model_name = inflection.underscore(model_name)
    if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name].model_construct_env(**attrs)
-    raise ValueError(
-        f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}."
-    )
+    raise ValueError(f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.")

  @classmethod
  def infer_class_from_name(cls, name: str) -> type[openllm_core.LLMConfig]:
    model_name = inflection.underscore(name)
    if model_name in CONFIG_NAME_ALIASES: model_name = CONFIG_NAME_ALIASES[model_name]
    if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name]
-    raise ValueError(
-        f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}."
-    )
+    raise ValueError(f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.")
--- a/openllm-core/src/openllm_core/config/configuration_baichuan.py
+++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py
@@ -37,24 +37,17 @@ class BaichuanConfig(openllm_core.LLMConfig):
  Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
  """
  __config__ = {
-      'name_type':
-          'lowercase',
-      'trust_remote_code':
-          True,
-      'timeout':
-          3600000,
-      'requires_gpu':
-          True,
-      'url':
-          'https://github.com/baichuan-inc/Baichuan-7B',
+      'name_type': 'lowercase',
+      'trust_remote_code': True,
+      'timeout': 3600000,
+      'requires_gpu': True,
+      'url': 'https://github.com/baichuan-inc/Baichuan-7B',
      'requirements': ['cpm-kernels', 'sentencepiece'],
-      'architecture':
-          'BaiChuanForCausalLM',
-      'default_id':
-          'baichuan-inc/baichuan-7b',
+      'architecture': 'BaiChuanForCausalLM',
+      'default_id': 'baichuan-inc/baichuan-7b',
      'model_ids': [
-          'baichuan-inc/baichuan-7b', 'baichuan-inc/baichuan-13b-base', 'baichuan-inc/baichuan-13b-chat',
-          'fireballoon/baichuan-vicuna-chinese-7b', 'fireballoon/baichuan-vicuna-7b', 'hiyouga/baichuan-7b-sft'
+          'baichuan-inc/baichuan-7b', 'baichuan-inc/baichuan-13b-base', 'baichuan-inc/baichuan-13b-chat', 'fireballoon/baichuan-vicuna-chinese-7b',
+          'fireballoon/baichuan-vicuna-7b', 'hiyouga/baichuan-7b-sft'
      ]
  }

--- a/openllm-core/src/openllm_core/config/configuration_chatglm.py
+++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py
@@ -41,30 +41,18 @@ class ChatGLMConfig(openllm_core.LLMConfig):
  Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
  """
  __config__ = {
-      'name_type':
-          'lowercase',
-      'trust_remote_code':
-          True,
-      'timeout':
-          3600000,
-      'requires_gpu':
-          True,
-      'url':
-          'https://github.com/THUDM/ChatGLM-6B',
+      'name_type': 'lowercase',
+      'trust_remote_code': True,
+      'timeout': 3600000,
+      'requires_gpu': True,
+      'url': 'https://github.com/THUDM/ChatGLM-6B',
      'requirements': ['cpm-kernels', 'sentencepiece'],
-      'architecture':
-          'ChatGLMForConditionalGeneration',
-      'default_id':
-          'thudm/chatglm-6b',
-      'model_ids': [
-          'thudm/chatglm-6b', 'thudm/chatglm-6b-int8', 'thudm/chatglm-6b-int4', 'thudm/chatglm2-6b',
-          'thudm/chatglm2-6b-int4'
-      ]
+      'architecture': 'ChatGLMForConditionalGeneration',
+      'default_id': 'thudm/chatglm-6b',
+      'model_ids': ['thudm/chatglm-6b', 'thudm/chatglm-6b-int8', 'thudm/chatglm-6b-int4', 'thudm/chatglm2-6b', 'thudm/chatglm2-6b-int4']
  }
  retain_history: bool = dantic.Field(
-      False,
-      description=
-      'Whether to retain history given to the model. If set to True, then the model will retain given history.')
+      False, description='Whether to retain history given to the model. If set to True, then the model will retain given history.')
  use_half_precision: bool = dantic.Field(True, description='Whether to use half precision for model.')

  class GenerationConfig:
--- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
+++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
@@ -105,6 +105,5 @@ class DollyV2Config(openllm_core.LLMConfig):
        **attrs
    }, {}

-  def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal['generated_text'], str]],
-                           **_: t.Any) -> str:
+  def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal['generated_text'], str]], **_: t.Any) -> str:
    return generation_result[0]['generated_text']
--- a/openllm-core/src/openllm_core/config/configuration_falcon.py
+++ b/openllm-core/src/openllm_core/config/configuration_falcon.py
@@ -39,21 +39,14 @@ class FalconConfig(openllm_core.LLMConfig):
  Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
  """
  __config__ = {
-      'name_type':
-          'lowercase',
-      'trust_remote_code':
-          True,
-      'requires_gpu':
-          True,
-      'timeout':
-          int(36e6),
-      'url':
-          'https://falconllm.tii.ae/',
+      'name_type': 'lowercase',
+      'trust_remote_code': True,
+      'requires_gpu': True,
+      'timeout': int(36e6),
+      'url': 'https://falconllm.tii.ae/',
      'requirements': ['einops', 'xformers'],
-      'architecture':
-          'FalconForCausalLM',
-      'default_id':
-          'tiiuae/falcon-7b',
+      'architecture': 'FalconForCausalLM',
+      'default_id': 'tiiuae/falcon-7b',
      'model_ids': ['tiiuae/falcon-7b', 'tiiuae/falcon-40b', 'tiiuae/falcon-7b-instruct', 'tiiuae/falcon-40b-instruct'],
      'fine_tune_strategies': ({
          'adapter_type': 'lora',
--- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py
+++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py
@@ -40,18 +40,11 @@ class FlanT5Config(openllm_core.LLMConfig):
  Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
  """
  __config__ = {
-      'url':
-          'https://huggingface.co/docs/transformers/model_doc/flan-t5',
-      'architecture':
-          'T5ForConditionalGeneration',
-      'model_type':
-          'seq2seq_lm',
-      'default_id':
-          'google/flan-t5-large',
-      'model_ids': [
-          'google/flan-t5-small', 'google/flan-t5-base', 'google/flan-t5-large', 'google/flan-t5-xl',
-          'google/flan-t5-xxl',
-      ]
+      'url': 'https://huggingface.co/docs/transformers/model_doc/flan-t5',
+      'architecture': 'T5ForConditionalGeneration',
+      'model_type': 'seq2seq_lm',
+      'default_id': 'google/flan-t5-large',
+      'model_ids': ['google/flan-t5-small', 'google/flan-t5-base', 'google/flan-t5-large', 'google/flan-t5-xl', 'google/flan-t5-xxl']
  }

  class GenerationConfig:
--- a/openllm-core/src/openllm_core/config/configuration_llama.py
+++ b/openllm-core/src/openllm_core/config/configuration_llama.py
@@ -66,31 +66,24 @@ class LlamaConfig(openllm_core.LLMConfig):
  Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
  for more information.
  """
-  use_llama2_prompt: bool = dantic.Field(
-      False, description='Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.')
+  use_llama2_prompt: bool = dantic.Field(False, description='Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.')
  __config__ = {
-      'name_type':
-          'lowercase',
-      'url':
-          'https://github.com/facebookresearch/llama',
+      'name_type': 'lowercase',
+      'url': 'https://github.com/facebookresearch/llama',
      'default_backend': {
          'cpu': 'pt',
          'nvidia.com/gpu': 'pt'
      },
-      'architecture':
-          'LlamaForCausalLM',
+      'architecture': 'LlamaForCausalLM',
      'requirements': ['fairscale', 'sentencepiece'],
-      'tokenizer_class':
-          'LlamaTokenizerFast',
-      'default_id':
-          'NousResearch/llama-2-7b-hf',
+      'tokenizer_class': 'LlamaTokenizerFast',
+      'default_id': 'NousResearch/llama-2-7b-hf',
      'model_ids': [
-          'meta-llama/Llama-2-70b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-7b-chat-hf',
-          'meta-llama/Llama-2-70b-hf', 'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-7b-hf',
-          'NousResearch/llama-2-70b-chat-hf', 'NousResearch/llama-2-13b-chat-hf', 'NousResearch/llama-2-7b-chat-hf',
-          'NousResearch/llama-2-70b-hf', 'NousResearch/llama-2-13b-hf', 'NousResearch/llama-2-7b-hf',
-          'openlm-research/open_llama_7b_v2', 'openlm-research/open_llama_3b_v2', 'openlm-research/open_llama_13b',
-          'huggyllama/llama-65b', 'huggyllama/llama-30b', 'huggyllama/llama-13b', 'huggyllama/llama-7b'
+          'meta-llama/Llama-2-70b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-7b-chat-hf', 'meta-llama/Llama-2-70b-hf',
+          'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-7b-hf', 'NousResearch/llama-2-70b-chat-hf', 'NousResearch/llama-2-13b-chat-hf',
+          'NousResearch/llama-2-7b-chat-hf', 'NousResearch/llama-2-70b-hf', 'NousResearch/llama-2-13b-hf', 'NousResearch/llama-2-7b-hf',
+          'openlm-research/open_llama_7b_v2', 'openlm-research/open_llama_3b_v2', 'openlm-research/open_llama_13b', 'huggyllama/llama-65b',
+          'huggyllama/llama-30b', 'huggyllama/llama-13b', 'huggyllama/llama-7b'
      ],
      'fine_tune_strategies': ({
          'adapter_type': 'lora',
@@ -120,15 +113,14 @@ class LlamaConfig(openllm_core.LLMConfig):
                          use_default_prompt_template: bool = False,
                          use_llama2_prompt: bool = True,
                          **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
-    return process_prompt(
-        prompt,
-        DEFAULT_PROMPT_TEMPLATE('v2' if use_llama2_prompt else 'v1') if use_default_prompt_template else None,
-        use_default_prompt_template, **attrs), {
-            'max_new_tokens': max_new_tokens,
-            'temperature': temperature,
-            'top_p': top_p,
-            'top_k': top_k
-        }, {}
+    return process_prompt(prompt,
+                          DEFAULT_PROMPT_TEMPLATE('v2' if use_llama2_prompt else 'v1') if use_default_prompt_template else None,
+                          use_default_prompt_template, **attrs), {
+                              'max_new_tokens': max_new_tokens,
+                              'temperature': temperature,
+                              'top_p': top_p,
+                              'top_k': top_k
+                          }, {}

  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
    return generation_result[0]
--- a/openllm-core/src/openllm_core/config/configuration_mpt.py
+++ b/openllm-core/src/openllm_core/config/configuration_mpt.py
@@ -44,12 +44,7 @@ _chat_prompt, _default_prompt, _instruct_prompt = '''{instruction}''', '''{instr
 {instruction}
 {response_key}
 '''.format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction='{instruction}', response_key=RESPONSE_KEY)
-PROMPT_MAPPING = {
-    'default': _default_prompt,
-    'instruct': _instruct_prompt,
-    'storywriter': _default_prompt,
-    'chat': _chat_prompt
-}
+PROMPT_MAPPING = {'default': _default_prompt, 'instruct': _instruct_prompt, 'storywriter': _default_prompt, 'chat': _chat_prompt}

 def _get_prompt(model_type: str) -> str:
  return PROMPT_MAPPING[model_type]
@@ -66,27 +61,20 @@ class MPTConfig(openllm_core.LLMConfig):
  for more details on specific models.
  """
  __config__ = {
-      'name_type':
-          'lowercase',
-      'trust_remote_code':
-          True,
-      'url':
-          'https://huggingface.co/mosaicml',
-      'timeout':
-          int(36e6),
+      'name_type': 'lowercase',
+      'trust_remote_code': True,
+      'url': 'https://huggingface.co/mosaicml',
+      'timeout': int(36e6),
      'requirements': ['triton', 'einops'],
-      'architecture':
-          'MPTForCausalLM',
-      'default_id':
-          'mosaicml/mpt-7b-instruct',
+      'architecture': 'MPTForCausalLM',
+      'default_id': 'mosaicml/mpt-7b-instruct',
      'model_ids': [
-          'mosaicml/mpt-7b', 'mosaicml/mpt-7b-instruct', 'mosaicml/mpt-7b-chat', 'mosaicml/mpt-7b-storywriter',
-          'mosaicml/mpt-30b', 'mosaicml/mpt-30b-instruct', 'mosaicml/mpt-30b-chat'
+          'mosaicml/mpt-7b', 'mosaicml/mpt-7b-instruct', 'mosaicml/mpt-7b-chat', 'mosaicml/mpt-7b-storywriter', 'mosaicml/mpt-30b',
+          'mosaicml/mpt-30b-instruct', 'mosaicml/mpt-30b-chat'
      ]
  }
-  prompt_type: MPTPromptType = dantic.Field(
-      '"default"',
-      description='Given prompt type for running MPT. Default will be inferred from model name if pretrained.')
+  prompt_type: MPTPromptType = dantic.Field('"default"',
+                                            description='Given prompt type for running MPT. Default will be inferred from model name if pretrained.')
  max_sequence_length: int = dantic.Field(
      2048,
      description=
@@ -106,7 +94,7 @@ class MPTConfig(openllm_core.LLMConfig):
                          prompt_type: MPTPromptType | None = None,
                          use_default_prompt_template: bool = True,
                          **attrs: t.Any,
-                         ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+                          ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    _template = None
    if use_default_prompt_template:
      if prompt_type is None:
--- a/openllm-core/src/openllm_core/config/configuration_opt.py
+++ b/openllm-core/src/openllm_core/config/configuration_opt.py
@@ -44,20 +44,12 @@ class OPTConfig(openllm_core.LLMConfig):
  Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
  """
  __config__ = {
-      'name_type':
-          'lowercase',
-      'trust_remote_code':
-          False,
-      'url':
-          'https://huggingface.co/docs/transformers/model_doc/opt',
-      'default_id':
-          'facebook/opt-1.3b',
-      'architecture':
-          'OPTForCausalLM',
-      'model_ids': [
-          'facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b', 'facebook/opt-6.7b',
-          'facebook/opt-66b'
-      ],
+      'name_type': 'lowercase',
+      'trust_remote_code': False,
+      'url': 'https://huggingface.co/docs/transformers/model_doc/opt',
+      'default_id': 'facebook/opt-1.3b',
+      'architecture': 'OPTForCausalLM',
+      'model_ids': ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b', 'facebook/opt-6.7b', 'facebook/opt-66b'],
      'fine_tune_strategies': ({
          'adapter_type': 'lora',
          'r': 16,
@@ -67,8 +59,7 @@ class OPTConfig(openllm_core.LLMConfig):
          'bias': 'none'
      },)
  }
-  format_outputs: bool = dantic.Field(
-      False, description='''Whether to format the outputs. This can be used when num_return_sequences > 1.''')
+  format_outputs: bool = dantic.Field(False, description='''Whether to format the outputs. This can be used when num_return_sequences > 1.''')

  class GenerationConfig:
    top_k: int = 15
--- a/openllm-core/src/openllm_core/config/configuration_stablelm.py
+++ b/openllm-core/src/openllm_core/config/configuration_stablelm.py
@@ -47,17 +47,13 @@ class StableLMConfig(openllm_core.LLMConfig):
  for more information.
  """
  __config__ = {
-      'name_type':
-          'lowercase',
-      'url':
-          'https://github.com/Stability-AI/StableLM',
-      'architecture':
-          'GPTNeoXForCausalLM',
-      'default_id':
-          'stabilityai/stablelm-tuned-alpha-3b',
+      'name_type': 'lowercase',
+      'url': 'https://github.com/Stability-AI/StableLM',
+      'architecture': 'GPTNeoXForCausalLM',
+      'default_id': 'stabilityai/stablelm-tuned-alpha-3b',
      'model_ids': [
-          'stabilityai/stablelm-tuned-alpha-3b', 'stabilityai/stablelm-tuned-alpha-7b',
-          'stabilityai/stablelm-base-alpha-3b', 'stabilityai/stablelm-base-alpha-7b'
+          'stabilityai/stablelm-tuned-alpha-3b', 'stabilityai/stablelm-tuned-alpha-7b', 'stabilityai/stablelm-base-alpha-3b',
+          'stabilityai/stablelm-base-alpha-7b'
      ]
  }

@@ -77,19 +73,10 @@ class StableLMConfig(openllm_core.LLMConfig):
                          **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    if 'tuned' in self._model_id and use_default_prompt_template:
      system_prompt = attrs.pop('system_prompt', SYSTEM_PROMPT)
-      prompt_text = process_prompt(prompt,
-                                   DEFAULT_PROMPT_TEMPLATE,
-                                   use_default_prompt_template,
-                                   system_prompt=system_prompt,
-                                   **attrs)
+      prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs)
    else:
      prompt_text = prompt
-    return prompt_text, {
-        'max_new_tokens': max_new_tokens,
-        'temperature': temperature,
-        'top_k': top_k,
-        'top_p': top_p
-    }, {}
+    return prompt_text, {'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'top_p': top_p}, {}

  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
    return generation_result[0]
--- a/openllm-core/src/openllm_core/utils/__init__.py
+++ b/openllm-core/src/openllm_core/utils/__init__.py
@@ -113,8 +113,7 @@ def field_env_key(key: str, suffix: str | None = None) -> str:
 # Special debug flag controlled via OPENLLMDEVDEBUG
 DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and check_bool_env(DEV_DEBUG_VAR, default=False))
 # Whether to show the generated code (codegen) for debug purposes
-SHOW_CODEGEN: bool = DEBUG and (os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit() and
-                                int(os.environ.get(DEV_DEBUG_VAR, str(0))) > 3)
+SHOW_CODEGEN: bool = DEBUG and (os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit() and int(os.environ.get(DEV_DEBUG_VAR, str(0))) > 3)
 # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
 MYPY = False

@@ -125,7 +124,6 @@ def get_quiet_mode() -> bool:
  return not DEBUG and _get_quiet_mode()

 class ExceptionFilter(logging.Filter):
-
  def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any):
    '''A filter of all exception.'''
    if exclude_exceptions is None: exclude_exceptions = [ConflictError]
@@ -142,7 +140,6 @@ class ExceptionFilter(logging.Filter):
    return True

 class InfoFilter(logging.Filter):
-
  def filter(self, record: logging.LogRecord) -> bool:
    return logging.INFO <= record.levelno < logging.WARNING

@@ -246,7 +243,6 @@ def compose(*funcs: AnyCallable) -> AnyCallable:
  >>> [f(3*x, x+1) for x in range(1,10)]
  [1.5, 2.0, 2.25, 2.4, 2.5, 2.571, 2.625, 2.667, 2.7]
  '''
-
  def compose_two(f1: AnyCallable, f2: AnyCallable) -> AnyCallable:
    return lambda *args, **kwargs: f1(f2(*args, **kwargs))

@@ -303,11 +299,7 @@ def generate_context(framework_name: str) -> _ModelContext:
    from bentoml._internal.frameworks.utils.tensorflow import get_tf_version
    framework_versions['tensorflow'] = get_tf_version()
  if openllm_core.utils.is_flax_available():
-    framework_versions.update({
-        'flax': pkg.get_pkg_version('flax'),
-        'jax': pkg.get_pkg_version('jax'),
-        'jaxlib': pkg.get_pkg_version('jaxlib')
-    })
+    framework_versions.update({'flax': pkg.get_pkg_version('flax'), 'jax': pkg.get_pkg_version('jax'), 'jaxlib': pkg.get_pkg_version('jaxlib')})
  return _ModelContext(framework_name=framework_name, framework_versions=framework_versions)

 _TOKENIZER_PREFIX = '_tokenizer_'
@@ -327,9 +319,7 @@ _whitelist_modules = {'pkg'}
 # XXX: define all classes and functions, and do all imports, above this line,
 # since _extras is built from the locals() of this file.
 _extras: dict[str, t.Any] = {
-    k: v
-    for k, v in locals().items()
-    if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_'))
+    k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_'))
 }
 _extras['__openllm_migration__'] = {'ModelEnv': 'EnvVarMixin'}
 _import_structure: dict[str, list[str]] = {
@@ -339,11 +329,10 @@ _import_structure: dict[str, list[str]] = {
    'lazy': [],
    'representation': ['ReprMixin'],
    'import_utils': [
-        'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available',
-        'is_einops_available', 'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available',
-        'is_bitsandbytes_available', 'is_peft_available', 'is_datasets_available', 'is_transformers_supports_kbit',
-        'is_transformers_supports_agent', 'is_jupyter_available', 'is_jupytext_available', 'is_notebook_available',
-        'is_triton_available', 'is_autogptq_available', 'is_sentencepiece_available', 'is_xformers_available',
+        'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available', 'is_einops_available',
+        'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available', 'is_bitsandbytes_available', 'is_peft_available',
+        'is_datasets_available', 'is_transformers_supports_kbit', 'is_transformers_supports_agent', 'is_jupyter_available', 'is_jupytext_available',
+        'is_notebook_available', 'is_triton_available', 'is_autogptq_available', 'is_sentencepiece_available', 'is_xformers_available',
        'is_fairscale_available', 'is_grpc_available', 'is_grpc_health_available', 'is_transformers_available'
    ]
 }
--- a/openllm-core/src/openllm_core/utils/analytics.py
+++ b/openllm-core/src/openllm_core/utils/analytics.py
@@ -34,7 +34,6 @@ def _usage_event_debugging() -> bool:
  return os.environ.get('__BENTOML_DEBUG_USAGE', str(False)).lower() == 'true'

 def silent(func: t.Callable[P, T]) -> t.Callable[P, T]:
-
  @functools.wraps(func)
  def wrapper(*args: P.args, **kwargs: P.kwargs) -> t.Any:
    try:
@@ -62,7 +61,6 @@ def set_bentoml_tracking() -> t.Generator[None, None, None]:
    os.environ[_internal_analytics.BENTOML_DO_NOT_TRACK] = original_value

 class EventMeta:
-
  @property
  def event_name(self) -> str:
    # camel case to snake case
--- a/openllm-core/src/openllm_core/utils/codegen.py
+++ b/openllm-core/src/openllm_core/utils/codegen.py
@@ -110,8 +110,7 @@ def generate_function(typ: type[t.Any],
                      globs: dict[str, t.Any],
                      annotations: dict[str, t.Any] | None = None) -> AnyCallable:
  from openllm_core.utils import SHOW_CODEGEN
-  script = 'def %s(%s):\n    %s\n' % (func_name, ', '.join(args) if args is not None else '',
-                                      '\n    '.join(lines) if lines else 'pass')
+  script = 'def %s(%s):\n    %s\n' % (func_name, ', '.join(args) if args is not None else '', '\n    '.join(lines) if lines else 'pass')
  meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs)
  if annotations: meth.__annotations__ = annotations
  if SHOW_CODEGEN: print('Generated script for {typ}:\n\n', script)
@@ -122,7 +121,7 @@ def make_env_transformer(cls: type[openllm_core.LLMConfig],
                         suffix: LiteralString | None = None,
                         default_callback: t.Callable[[str, t.Any], t.Any] | None = None,
                         globs: DictStrAny | None = None,
-                        ) -> AnyCallable:
+                         ) -> AnyCallable:
  from openllm_core.utils import dantic
  from openllm_core.utils import field_env_key

@@ -171,16 +170,15 @@ def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T:
  return t.cast(
      _T,
      functools.update_wrapper(
-          types.new_class(
-              name, (t.cast('PartialAny', functools.partial), ReprMixin),
-              exec_body=lambda ns: ns.update({
-                  '__repr_keys__': property(lambda _: [i for i in _signatures.keys() if not i.startswith('_')]),
-                  '__repr_args__': _repr_args,
-                  '__repr__': _repr,
-                  '__doc__': inspect.cleandoc(doc),
-                  '__module__': 'openllm'
-              }),
-          )(func, **attrs), func,
+          types.new_class(name, (t.cast('PartialAny', functools.partial), ReprMixin),
+                          exec_body=lambda ns: ns.update({
+                              '__repr_keys__': property(lambda _: [i for i in _signatures.keys() if not i.startswith('_')]),
+                              '__repr_args__': _repr_args,
+                              '__repr__': _repr,
+                              '__doc__': inspect.cleandoc(doc),
+                              '__module__': 'openllm'
+                          }),
+                          )(func, **attrs), func,
      ))

 __all__ = ['gen_sdk', 'make_attr_tuple_class', 'make_env_transformer', 'generate_unique_filename', 'generate_function']
--- a/openllm-core/src/openllm_core/utils/dantic.py
+++ b/openllm-core/src/openllm_core/utils/dantic.py
@@ -25,9 +25,8 @@ AnyCallable = t.Callable[..., t.Any]
 FC = t.TypeVar('FC', bound=t.Union[AnyCallable, click.Command])

 __all__ = [
-    'FC', 'attrs_to_options', 'Field', 'parse_type', 'is_typing', 'is_literal', 'ModuleType', 'EnumChoice',
-    'LiteralChoice', 'allows_multiple', 'is_mapping', 'is_container', 'parse_container_args', 'parse_single_arg',
-    'CUDA', 'JsonType', 'BytesType'
+    'FC', 'attrs_to_options', 'Field', 'parse_type', 'is_typing', 'is_literal', 'ModuleType', 'EnumChoice', 'LiteralChoice', 'allows_multiple',
+    'is_mapping', 'is_container', 'parse_container_args', 'parse_single_arg', 'CUDA', 'JsonType', 'BytesType'
 ]

 def __dir__() -> list[str]:
@@ -64,7 +63,7 @@ def attrs_to_options(name: str,
                             help=field.metadata.get('description', '(No description provided)'),
                             show_envvar=True,
                             envvar=envvar,
-                            )
+                             )

 def env_converter(value: t.Any, env: str | None = None) -> t.Any:
  if env is not None:
--- a/openllm-core/src/openllm_core/utils/import_utils.py
+++ b/openllm-core/src/openllm_core/utils/import_utils.py
@@ -27,9 +27,7 @@ if t.TYPE_CHECKING:
  BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]]

 logger = logging.getLogger(__name__)
-OPTIONAL_DEPENDENCIES = {
-    'opt', 'flan-t5', 'vllm', 'fine-tune', 'ggml', 'agents', 'openai', 'playground', 'gptq', 'grpc'
-}
+OPTIONAL_DEPENDENCIES = {'opt', 'flan-t5', 'vllm', 'fine-tune', 'ggml', 'agents', 'openai', 'playground', 'gptq', 'grpc'}
 ENV_VARS_TRUE_VALUES = {'1', 'ON', 'YES', 'TRUE'}
 ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({'AUTO'})
 USE_TF = os.environ.get('USE_TF', 'AUTO').upper()
@@ -144,10 +142,9 @@ def is_tf_available() -> bool:
    _tf_version = None
    if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
      if _tf_available:
-        candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu',
-                      'tf-nightly-gpu', 'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm',
-                      'tensorflow-macos', 'tensorflow-aarch64',
-                     )
+        candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow',
+                      'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos', 'tensorflow-aarch64',
+                      )
        _tf_version = None
        # For the metadata, we have to look for both tensorflow and tensorflow-cpu
        for _pkg in candidates:
@@ -285,18 +282,20 @@ You can install it with pip: `pip install fairscale`. Please note that you may n
 your runtime after installation.
 '''

-BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([
-    ('flax', (is_flax_available, FLAX_IMPORT_ERROR)), ('tf', (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
-    ('torch', (is_torch_available, PYTORCH_IMPORT_ERROR)), ('vllm', (is_vllm_available, VLLM_IMPORT_ERROR)),
-    ('cpm_kernels', (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)),
-    ('einops', (is_einops_available, EINOPS_IMPORT_ERROR)), ('triton', (is_triton_available, TRITON_IMPORT_ERROR)),
-    ('datasets', (is_datasets_available, DATASETS_IMPORT_ERROR)), ('peft', (is_peft_available, PEFT_IMPORT_ERROR)),
-    ('bitsandbytes', (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
-    ('auto-gptq', (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)),
-    ('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
-    ('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)),
-    ('fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))
-])
+BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([('flax', (is_flax_available, FLAX_IMPORT_ERROR)),
+                                                    ('tf', (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
+                                                    ('torch', (is_torch_available, PYTORCH_IMPORT_ERROR)),
+                                                    ('vllm', (is_vllm_available, VLLM_IMPORT_ERROR)),
+                                                    ('cpm_kernels', (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)),
+                                                    ('einops', (is_einops_available, EINOPS_IMPORT_ERROR)),
+                                                    ('triton', (is_triton_available, TRITON_IMPORT_ERROR)),
+                                                    ('datasets', (is_datasets_available, DATASETS_IMPORT_ERROR)),
+                                                    ('peft', (is_peft_available, PEFT_IMPORT_ERROR)),
+                                                    ('bitsandbytes', (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
+                                                    ('auto-gptq', (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)),
+                                                    ('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
+                                                    ('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)),
+                                                    ('fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))])

 class DummyMetaclass(abc.ABCMeta):
  '''Metaclass for dummy object.
@@ -326,9 +325,7 @@ def require_backends(o: t.Any, backends: t.MutableSequence[str]) -> None:
      raise ImportError(VLLM_IMPORT_ERROR_WITH_TF.format(name))
    if 'flax' not in backends and is_flax_available() and not is_vllm_available():
      raise ImportError(VLLM_IMPORT_ERROR_WITH_FLAX.format(name))
-  failed = [
-      msg.format(name) for available, msg in (BACKENDS_MAPPING[backend] for backend in backends) if not available()
-  ]
+  failed = [msg.format(name) for available, msg in (BACKENDS_MAPPING[backend] for backend in backends) if not available()]
  if failed: raise ImportError(''.join(failed))

 class EnvVarMixin(ReprMixin):
@@ -371,11 +368,7 @@ class EnvVarMixin(ReprMixin):
    elif hasattr(self, item): return getattr(self, item)
    raise KeyError(f'Key {item} not found in {self}')

-  def __init__(self,
-               model_name: str,
-               backend: LiteralBackend = 'pt',
-               model_id: str | None = None,
-               quantize: LiteralString | None = None) -> None:
+  def __init__(self, model_name: str, backend: LiteralBackend = 'pt', model_id: str | None = None, quantize: LiteralString | None = None) -> None:
    '''EnvVarMixin is a mixin class that returns the value extracted from environment variables.'''
    from openllm_core.utils import field_env_key
    self.model_name = inflection.underscore(model_name)
@@ -387,8 +380,7 @@ class EnvVarMixin(ReprMixin):

  def _quantize_value(self) -> t.Literal['int8', 'int4', 'gptq'] | None:
    from . import first_not_none
-    return t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']],
-                  first_not_none(os.environ.get(self['quantize']), default=self._quantize))
+    return t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], first_not_none(os.environ.get(self['quantize']), default=self._quantize))

  def _backend_value(self) -> LiteralBackend:
    from . import first_not_none
--- a/openllm-core/src/openllm_core/utils/lazy.py
+++ b/openllm-core/src/openllm_core/utils/lazy.py
@@ -110,8 +110,7 @@ class LazyModule(types.ModuleType):
    It also contains a special case for all of the metadata information, such as __version__ and __version_info__.
    '''
    if name in _reserved_namespace:
-      raise openllm_core.exceptions.ForbiddenAttributeError(
-          f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.")
+      raise openllm_core.exceptions.ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.")
    dunder_to_metadata = {
        '__title__': 'Name',
        '__copyright__': '',
@@ -147,10 +146,9 @@ class LazyModule(types.ModuleType):
    if '__openllm_migration__' in self._objects:
      cur_value = self._objects['__openllm_migration__'].get(name, _sentinel)
      if cur_value is not _sentinel:
-        warnings.warn(
-            f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead",
-            DeprecationWarning,
-            stacklevel=3)
+        warnings.warn(f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead",
+                      DeprecationWarning,
+                      stacklevel=3)
        return getattr(self, cur_value)
    if name in self._objects: return self._objects.__getitem__(name)
    if name in self._modules: value = self._get_module(name)
@@ -165,9 +163,7 @@ class LazyModule(types.ModuleType):
    try:
      return importlib.import_module('.' + module_name, self.__name__)
    except Exception as e:
-      raise RuntimeError(
-          f'Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}'
-      ) from e
+      raise RuntimeError(f'Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}') from e

  # make sure this module is picklable
  def __reduce__(self) -> tuple[type[LazyModule], tuple[str, str | None, dict[str, list[str]]]]:
--- a/openllm-core/src/openllm_core/utils/representation.py
+++ b/openllm-core/src/openllm_core/utils/representation.py
@@ -14,7 +14,6 @@ if t.TYPE_CHECKING:
 ReprArgs: TypeAlias = t.Generator[t.Tuple[t.Optional[str], t.Any], None, None]

 class ReprMixin:
-
  @property
  @abstractmethod
  def __repr_keys__(self) -> set[str]:
--- a/openllm-python/src/openllm/init.py
+++ b/openllm-python/src/openllm/init.py
@@ -26,14 +26,11 @@ else:
  # configuration for bitsandbytes before import
  _os.environ["BITSANDBYTES_NOWELCOME"] = _os.environ.get("BITSANDBYTES_NOWELCOME", "1")
  # NOTE: The following warnings come from bitsandbytes and are probably not important for users to see when DEBUG is False
-  _warnings.filterwarnings(
-      "ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
-  _warnings.filterwarnings(
-      "ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
+  _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
+  _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
  _warnings.filterwarnings("ignore", message="The installed version of bitsandbytes was compiled without GPU support.")
  # NOTE: ignore the following warning from ghapi as it is not important for users
-  _warnings.filterwarnings("ignore",
-                           message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
+  _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")

 _import_structure: dict[str, list[str]] = {
    "exceptions": [],
@@ -48,13 +45,8 @@ _import_structure: dict[str, list[str]] = {
    "_quantisation": ["infer_quantisation_config"],
    "_embeddings": ["GenericEmbeddingRunnable"],
    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "EmbeddingsOutput"],
-    "_generation": [
-        "StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList",
-        "prepare_logits_processor"
-    ],
-    "models.auto": [
-        "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"
-    ],
+    "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
+    "models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"],
    "models.chatglm": [],
    "models.baichuan": [],
    "models.dolly_v2": [],
@@ -114,8 +106,7 @@ try:
  if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_pt_objects"] = [
-      name for name in dir(utils.dummy_pt_objects)
-      if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")
+      name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")
  ]
 else:
  _import_structure["models.flan_t5"].extend(["FlanT5"])
--- a/openllm-python/src/openllm/_assign.py
+++ b/openllm-python/src/openllm/_assign.py
@@ -36,7 +36,6 @@ else:
  vllm = LazyLoader('vllm', globals(), 'vllm')

 def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]:
-
  @functools.wraps(fn)
  def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model:
    trust_remote_code = first_not_none(trust_remote_code, default=self.trust_remote_code)
@@ -48,7 +47,6 @@ def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[
  return inner

 def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]:
-
  @functools.wraps(fn)
  def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine:
    if self.__llm_backend__ == 'vllm':
@@ -71,7 +69,6 @@ def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vll
  return inner

 def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]], T]:
-
  @functools.wraps(fn)
  def inner(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T:
    return fn(self, **{**self.llm_parameters[-1], **tokenizer_attrs})
@@ -79,7 +76,6 @@ def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]],
  return inner

 def llm_post_init(fn: llm_post_init_protocol[M, T]) -> t.Callable[[LLM[M, T]], None]:
-
  @functools.wraps(fn)
  def inner(self: LLM[M, T]) -> None:
    if self.__llm_backend__ == 'pt' and is_torch_available():
@@ -98,8 +94,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
  args: ListStr = []
  globs: DictStrAny = {'cls': cls, '__wrapped_llm_post_init': llm_post_init, 'LLM': LLM}
  # _cached_LLMFunction_get and _cached_LLMSerialisation_get
-  globs.update(
-      {f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}})
+  globs.update({f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}})
  # llm_post_init implementation
  lines: ListStr = [
      f'_impl_{cls.__name__}_func=cls.llm_post_init',
@@ -112,17 +107,13 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
    globs.update({f'__serialisation_{func}': getattr(openllm.serialisation, func, None), impl_name: impl})
    cached_func_name = f'_cached_{cls.__name__}_func'
    func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMSerialisation_get('{func}') else __serialisation_{func}"
-    lines.extend([
-        f'{cached_func_name}=cls.{func}', func_call,
-        _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})')
-    ])
+    lines.extend([f'{cached_func_name}=cls.{func}', func_call, _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})')])

  # assign vLLM implementation
  if cls.__llm_backend__ == 'vllm':
    vllm_func = {
        f'_vllm_{it}': fn
-        for it, fn in zip(('generate', 'generate_iterator',
-                           'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))
+        for it, fn in zip(('generate', 'generate_iterator', 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate))
    }
    globs.update(vllm_func)
    lines.extend([_setattr_class(it[6:], it) for it in vllm_func])
@@ -141,8 +132,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
    return f'__llm_supports_{key}__'

  bool_attr = {it[15:-2] for it in interface_anns if it.startswith('__llm_supports_')}
-  lines.extend(
-      [_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])
+  lines.extend([_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr])

  return codegen.generate_function(cls,
                                   '__assign_llm_attr',
@@ -154,8 +144,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N
                                       'return': None
                                   })

-def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]],
-                              **_: t.Any) -> str:
+def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
  return generation_result[0]['outputs'][0]['text']

 def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T],
@@ -193,9 +182,7 @@ def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) -
  if request_id is None: raise ValueError('request_id must not be None.')
  outputs: list[vllm.RequestOutput] = []
  # TODO: support prompt_token_ids
-  self.model.add_request(request_id=request_id,
-                         prompt=prompt,
-                         sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
+  self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config())
  while self.model.has_unfinished_requests():
    outputs.extend([r for r in self.model.step() if r.finished])
  return [unmarshal_vllm_outputs(i) for i in outputs]
--- a/openllm-python/src/openllm/_embeddings.py
+++ b/openllm-python/src/openllm/_embeddings.py
@@ -25,9 +25,8 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
    return bentoml.transformers.get(ids)
  except bentoml.exceptions.NotFound:
    model_signatures = {
-        k: ModelSignature(batchable=False)
-        for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
-                  'group_beam_search', 'constrained_beam_search', '__call__')
+        k: ModelSignature(batchable=False) for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search',
+                                                    'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__')
    }
    with bentoml.models.create(ids,
                               module=MODULE_NAME,
@@ -39,11 +38,10 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
                                   'framework': 'openllm'
                               },
                               signatures=model_signatures) as bentomodel:
-      snapshot_download(
-          _GENERIC_EMBEDDING_ID,
-          local_dir=bentomodel.path,
-          local_dir_use_symlinks=False,
-          ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt'])
+      snapshot_download(_GENERIC_EMBEDDING_ID,
+                        local_dir=bentomodel.path,
+                        local_dir_use_symlinks=False,
+                        ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt'])
      return bentomodel

 class GenericEmbeddingRunnable(bentoml.Runnable):
@@ -68,10 +66,7 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
      model_output = self.model(**encoded_input)
    # Perform pooling and normalize
    sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
-    return [
-        openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(),
-                                 num_tokens=int(torch.sum(attention_mask).item()))
-    ]
+    return [openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))]

  @staticmethod
  def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
--- a/openllm-python/src/openllm/_generation.py
+++ b/openllm-python/src/openllm/_generation.py
@@ -14,18 +14,15 @@ LogitsProcessorList = transformers.LogitsProcessorList
 StoppingCriteriaList = transformers.StoppingCriteriaList

 class StopSequenceCriteria(transformers.StoppingCriteria):
-
-  def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer |
-               transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
+  def __init__(self, stop_sequences: str | list[str],
+               tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
    if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
    self.stop_sequences, self.tokenizer = stop_sequences, tokenizer

  def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool:
-    return any(
-        self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)
+    return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)

 class StopOnTokens(transformers.StoppingCriteria):
-
  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool:
    return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}

--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -122,7 +122,6 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp
 _reserved_namespace = {'config_class', 'model', 'tokenizer', 'import_kwargs'}

 class LLMFunction(abc.ABC):
-
  @abc.abstractmethod
  def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
    '''This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.).
@@ -158,8 +157,7 @@ class LLMFunction(abc.ABC):
    '''
    raise NotImplementedError

-  def generate_one(self, prompt: str, stop: list[str],
-                   **preprocess_generate_kwds: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
    '''The entrypoint for generating one prompt.

    This provides additional stop tokens for generating per token level. This is useful when running with agents, or initial streaming support.
@@ -177,7 +175,6 @@ class LLMFunction(abc.ABC):
    raise NotImplementedError

 class LLMSerialisation(abc.ABC, t.Generic[M, T]):
-
  def import_model(self, *args: t.Any, trust_remote_code: bool, **attrs: t.Any) -> bentoml.Model:
    '''Import both model and tokenizer weights into as a BentoML models.

@@ -206,7 +203,6 @@ class LLMSerialisation(abc.ABC, t.Generic[M, T]):
    raise NotImplementedError

 class LLMInterface(LLMFunction, LLMSerialisation[M, T], abc.ABC):
-
  def llm_post_init(self) -> None:
    '''This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals.
    By default, this will add `self.device` if the implementation is PyTorch.
@@ -282,12 +278,12 @@ class LLM(LLMInterface[M, T], ReprMixin):
  if t.TYPE_CHECKING: __name__: str
  if t.TYPE_CHECKING and not MYPY:

-    def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig,
-                                                                                        autogptq.BaseQuantizeConfig]],
-                       model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny,
-                       tag: bentoml.Tag, adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str],
-                       quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
-                       serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any) -> None:
+    def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
+                       model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag,
+                       adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str],
+                       quantize_method: t.Optional[t.Literal['int8', 'int4',
+                                                             'gptq']], serialisation_format: t.Literal['safetensors',
+                                                                                                       'legacy'], _local: bool, **attrs: t.Any) -> None:
      '''Generated __attrs_init__ for openllm.LLM.'''

  config: LLMConfig
@@ -434,20 +430,16 @@ class LLM(LLMInterface[M, T], ReprMixin):
    '''
    cfg_cls = cls.config_class
    _local = False
-    _model_id: str = first_not_none(model_id,
-                                    os.environ.get(cfg_cls.__openllm_env__['model_id']),
-                                    default=cfg_cls.__openllm_default_id__)
+    _model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__)
    if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True
    quantize = first_not_none(quantize,
-                              t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']],
-                                     os.environ.get(cfg_cls.__openllm_env__['quantize'])),
+                              t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])),
                              default=None)

    # quantization setup
    if quantization_config and quantize:
      raise ValueError(
-          "'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument."
-      )
+          "'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
    if quantization_config is None and quantize is not None:
      quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
    if quantize == 'gptq': serialisation = 'safetensors'
@@ -460,9 +452,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
      )
    if adapter_map is None and adapter_id is not None: adapter_map = {adapter_id: adapter_name}
    if adapter_map is not None and not is_peft_available():
-      raise RuntimeError(
-          "LoRA adapter requires 'peft' to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
-      )
+      raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
    if adapter_map: logger.debug('OpenLLM will apply the following adapters layers: %s', list(adapter_map))

    if llm_config is None:
@@ -517,16 +507,14 @@ class LLM(LLMInterface[M, T], ReprMixin):
    model_id, *maybe_revision = model_id.rsplit(':')
    if len(maybe_revision) > 0:
      if model_version is not None:
-        logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",
-                       maybe_revision[0], model_version)
+        logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.", maybe_revision[0], model_version)
      return f'{cls.__llm_backend__}-{model_name}:{maybe_revision[0]}'

    tag_name = f'{cls.__llm_backend__}-{model_name}'
    if openllm_core.utils.check_bool_env('OPENLLM_USE_LOCAL_LATEST', False):
      return str(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag)
    if validate_is_path(model_id):
-      model_id, model_version = resolve_filepath(model_id), first_not_none(model_version,
-                                                                           default=generate_hash_from_file(model_id))
+      model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id))
    else:
      from .serialisation.transformers._helpers import process_config
      model_version = getattr(
@@ -542,11 +530,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
    return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))

  def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig,
-               quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
-               _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag,
-               _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str,
+               quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None,
+               _tag: bentoml.Tag, _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str,
               _serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any,
-              ):
+               ):
    '''Initialize the LLM with given pretrained model.

    > [!WARNING]
@@ -662,8 +649,7 @@ class LLM(LLMInterface[M, T], ReprMixin):

  @property
  def trust_remote_code(self) -> bool:
-    return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE'),
-                          default=self.config['trust_remote_code'])
+    return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE'), default=self.config['trust_remote_code'])

  @property
  def adapters_mapping(self) -> AdaptersMapping | None:
@@ -698,10 +684,7 @@ class LLM(LLMInterface[M, T], ReprMixin):

  @property
  def identifying_params(self) -> DictStrAny:
-    return {
-        'configuration': self.config.model_dump_json().decode(),
-        'model_ids': orjson.dumps(self.config['model_ids']).decode()
-    }
+    return {'configuration': self.config.model_dump_json().decode(), 'model_ids': orjson.dumps(self.config['model_ids']).decode()}

  @property
  def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]:
@@ -755,8 +738,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
      model = self.load_model(*self._model_decls, **self._model_attrs)
      # If OOM, then it is probably you don't have enough VRAM to run this model.
      if self.__llm_backend__ == 'pt' and is_torch_available():
-        loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(
-            model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
+        loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(
+            model, 'is_quantized', False)
        if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
          try:
            model = model.to('cuda')
@@ -785,24 +768,20 @@ class LLM(LLMInterface[M, T], ReprMixin):
    _converted_first_none = False
    for _adapter_type, _adapters_tuples in self._adapters_mapping.items():
      strategy = first_not_none(self.config['fine_tune_strategies'].get(_adapter_type),
-                                default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type),
-                                                       llm_config_class=self.config_class))
+                                default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type), llm_config_class=self.config_class))
      default_config = strategy.eval() if inference_mode else strategy.train()
      for adapter in _adapters_tuples:
        if not adapter.name and _converted_first_none:
-          raise ValueError(
-              f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}"
-          )
+          raise ValueError(f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}")
        name = adapter.name
        if name is None:
          _converted_first_none = True
          name = 'default'
        peft_config = default_config.with_config(
-            **adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(
-                adapter_type=t.cast('PeftType', _adapter_type),
-                adapter_config=adapter.config,
-                inference_mode=inference_mode,
-                llm_config_class=self.config_class).to_peft_config()
+            **adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type),
+                                                                                        adapter_config=adapter.config,
+                                                                                        inference_mode=inference_mode,
+                                                                                        llm_config_class=self.config_class).to_peft_config()
        adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id)
    if self.__llm_adapter_map__ is None and use_cache: self.__llm_adapter_map__ = adapter_map
    return adapter_map
@@ -834,8 +813,7 @@ class LLM(LLMInterface[M, T], ReprMixin):

    _mapping = self._transpose_adapter_mapping(inference_mode=inference_mode, use_cache=use_cache)
    if adapter_type not in _mapping:
-      raise ValueError(
-          f'Given adapter type {adapter_type} is not supported. Please choose from {list(_mapping.keys())}')
+      raise ValueError(f'Given adapter type {adapter_type} is not supported. Please choose from {list(_mapping.keys())}')
    adapter_mapping = _mapping[adapter_type]

    self.__llm_model__ = self._wrap_default_peft_model(adapter_mapping, inference_mode=inference_mode)
@@ -857,25 +835,21 @@ class LLM(LLMInterface[M, T], ReprMixin):

    return self.__llm_model__

-  def _wrap_default_peft_model(self, adapter_mapping: dict[str, tuple[peft.PeftConfig, str]],
-                               inference_mode: bool) -> M:
+  def _wrap_default_peft_model(self, adapter_mapping: dict[str, tuple[peft.PeftConfig, str]], inference_mode: bool) -> M:
    if self.__llm_model__ is None: raise ValueError('Error: Model is not loaded correctly')
    if isinstance(self.__llm_model__, peft.PeftModel): return self.__llm_model__
    if not isinstance(self.__llm_model__, transformers.PreTrainedModel):
      raise ValueError('Loading LoRA layers currently only runs on PyTorch models.')

    if 'default' not in adapter_mapping:
-      raise ValueError(
-          "There is no 'default' mapping. Please check the adapter mapping and report this bug to the OpenLLM team.")
+      raise ValueError("There is no 'default' mapping. Please check the adapter mapping and report this bug to the OpenLLM team.")
    default_config, peft_model_id = adapter_mapping.pop('default')

    # the below shared similar logics with `get_peft_model`
    # TODO: Support PromptLearningConfig
-    if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(
-        default_config, peft.PromptLearningConfig):
-      logger.debug(
-          "Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.",
-          default_config.task_type)
+    if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(default_config, peft.PromptLearningConfig):
+      logger.debug("Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.",
+                   default_config.task_type)
      model = peft.PeftModel(self.__llm_model__, default_config)
    else:
      # XXX: this is not ideal to serialize like this, maybe for fine-tune we will only support 0.4.0
@@ -894,12 +868,11 @@ class LLM(LLMInterface[M, T], ReprMixin):

  # order of these fields matter here, make sure to sync it with
  # openllm.models.auto.factory.BaseAutoLLMClass.for_model
-  def to_runner(
-      self,
-      models: list[bentoml.Model] | None = None,
-      max_batch_size: int | None = None,
-      max_latency_ms: int | None = None,
-      scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]:
+  def to_runner(self,
+                models: list[bentoml.Model] | None = None,
+                max_batch_size: int | None = None,
+                max_latency_ms: int | None = None,
+                scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]:
    '''Convert this LLM into a Runner.

    Args:
@@ -1047,10 +1020,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
        else:
          tmp_output_ids = output_ids[input_echo_len:]
          rfind_start = 0
-        output = self.tokenizer.decode(tmp_output_ids,
-                                       skip_special_tokens=True,
-                                       spaces_between_special_tokens=False,
-                                       clean_up_tokenization_spaces=True)
+        output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)

        partially_stopped = False
        if stop:
@@ -1183,25 +1153,17 @@ def Runner(model_name: str,
  '''
  if llm_config is not None:
    attrs.update({
-        'model_id':
-            llm_config['env']['model_id_value'],
-        'quantize':
-            llm_config['env']['quantize_value'],
-        'serialisation':
-            first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
+        'model_id': llm_config['env']['model_id_value'],
+        'quantize': llm_config['env']['quantize_value'],
+        'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors')
    })

  backend = t.cast(
      LiteralBackend,
      first_not_none(backend,
-                     default=EnvVarMixin(
-                         model_name,
-                         backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
+                     default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value']))
  if init_local: ensure_available = True
-  runner = infer_auto_class(backend).create_runner(model_name,
-                                                   llm_config=llm_config,
-                                                   ensure_available=ensure_available,
-                                                   **attrs)
+  runner = infer_auto_class(backend).create_runner(model_name, llm_config=llm_config, ensure_available=ensure_available, **attrs)
  if init_local: runner.init_local(quiet=True)
  return runner

@@ -1214,7 +1176,6 @@ class SetAdapterOutput(t.TypedDict):

 def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature,
                       generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]:
-
  class _Runnable(bentoml.Runnable):
    SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
    SUPPORTS_CPU_MULTI_THREADING = True
@@ -1252,8 +1213,7 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
      return self.generate(prompt, **attrs)

    @bentoml.Runnable.method(**method_signature(generate_sig))  # type: ignore
-    def generate_one(__self: _Runnable, prompt: str, stop: list[str],
-                     **attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
+    def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]:
      adapter_name = attrs.pop('adapter_name', None)
      if adapter_name is not None: __self.set_adapter(adapter_name)
      return self.generate_one(prompt, stop, **attrs)
@@ -1275,22 +1235,15 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate

  return types.new_class(
      self.__class__.__name__ + 'Runnable', (_Runnable,), {}, lambda ns: ns.update({
-          'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu')
-                                 if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
-          '__module__':
-              self.__module__,
-          '__doc__':
-              self.config['env'].start_docstring
+          'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'),
+          '__module__': self.__module__,
+          '__doc__': self.config['env'].start_docstring
      }))

 def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
-
  def available_adapters(_: LLMRunner[M, T]) -> PeftAdapterOutput:
    if not is_peft_available():
-      return PeftAdapterOutput(
-          success=False,
-          result={},
-          error_msg="peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'")
+      return PeftAdapterOutput(success=False, result={}, error_msg="peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'")
    if self.__llm_adapter_map__ is None:
      return PeftAdapterOutput(success=False, result={}, error_msg='No adapters available for current running server.')
    if not isinstance(self.model, peft.PeftModel):
--- a/openllm-python/src/openllm/_quantisation.py
+++ b/openllm-python/src/openllm/_quantisation.py
@@ -16,8 +16,7 @@ if t.TYPE_CHECKING:
  from ._llm import LLM

 autogptq, torch, transformers = LazyLoader('autogptq', globals(),
-                                           'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader(
-                                               'transformers', globals(), 'transformers')
+                                           'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')

 logger = logging.getLogger(__name__)

@@ -33,9 +32,8 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal[
                              **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
  ...

-def infer_quantisation_config(
-    cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode,
-    **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode,
+                              **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
  # 8 bit configuration
  int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
  int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -61,7 +59,7 @@ def infer_quantisation_config(
                                           llm_int8_threshhold=int8_threshold,
                                           llm_int8_skip_modules=int8_skip_modules,
                                           llm_int8_has_fp16_weight=int8_has_fp16_weight,
-                                          )
+                                           )

  # 4 bit configuration
  int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
@@ -72,9 +70,7 @@ def infer_quantisation_config(
  # quantize is a openllm.LLM feature, where we can quantize the model
  # with bitsandbytes or quantization aware training.
  if not is_bitsandbytes_available():
-    raise RuntimeError(
-        "Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
-    )
+    raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
  if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules)
  elif quantise == 'int4':
    if is_transformers_supports_kbit():
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -21,35 +21,28 @@ if t.TYPE_CHECKING:
  from bentoml._internal.runner.runner import AbstractRunner
  from bentoml._internal.runner.runner import RunnerMethod
  from openllm_core._typing_compat import TypeAlias
-  _EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]],
-                                             [t.List[str]], t.Sequence[openllm.EmbeddingsOutput]]
+  _EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]],
+                                             t.Sequence[openllm.EmbeddingsOutput]]

 # The following warnings from bitsandbytes, and probably not that important for users to see
-warnings.filterwarnings('ignore',
-                        message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
-warnings.filterwarnings('ignore',
-                        message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
+warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
+warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
 warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')

 model = os.environ.get('OPENLLM_MODEL', '{__model_name__}')  # openllm: model name
 adapter_map = os.environ.get('OPENLLM_ADAPTER_MAP', '''{__model_adapter_map__}''')  # openllm: model adapter map
 llm_config = openllm.AutoConfig.for_model(model)
 runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map))
-generic_embedding_runner = bentoml.Runner(
-    openllm.GenericEmbeddingRunnable,  # XXX: remove arg-type once bentoml.Runner is correct set with type
-    name='llm-generic-embedding',
-    scheduling_strategy=openllm_core.CascadingResourceStrategy,
-    max_batch_size=32,
-    max_latency_ms=300)
+generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable,  # XXX: remove arg-type once bentoml.Runner is correct set with type
+                                          name='llm-generic-embedding',
+                                          scheduling_strategy=openllm_core.CascadingResourceStrategy,
+                                          max_batch_size=32,
+                                          max_latency_ms=300)
 runners: list[AbstractRunner] = [runner]
 if not runner.supports_embeddings: runners.append(generic_embedding_runner)
 svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)

-_JsonInput = bentoml.io.JSON.from_sample({
-    'prompt': '',
-    'llm_config': llm_config.model_dump(flatten=True),
-    'adapter_name': None
-})
+_JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None})

@svc.api(route='/v1/generate',
         input=_JsonInput,
@@ -67,10 +60,7 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
 async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  echo = input_dict.pop('echo', False)
  qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
-  return runner.generate_iterator.async_stream(qa_inputs.prompt,
-                                               adapter_name=qa_inputs.adapter_name,
-                                               echo=echo,
-                                               **qa_inputs.llm_config.model_dump())
+  return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump())

@svc.api(route='/v1/metadata',
         input=bentoml.io.Text(),
@@ -96,12 +86,10 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
         input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
         output=bentoml.io.JSON.from_sample({
             'embeddings': [
-                 0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008,
-                 -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362,
-                 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589,
-                 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918,
-                 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076,
-                 -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282,
+                 0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752,
+                 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589,
+                 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679,
+                 -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282,
                 -0.014814382418990135, 0.01796768605709076
             ],
             'num_tokens': 20
@@ -121,8 +109,7 @@ if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
      raise openllm.exceptions.OpenLLMException(f'Invalid JSON input received: {err}') from None
    stop = input_data.parameters.pop('stop', ['\n'])
    try:
-      return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters),
-                          status_code=200)
+      return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters), status_code=200)
    except NotImplementedError:
      return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)

--- a/openllm-python/src/openllm/bundle/init.py
+++ b/openllm-python/src/openllm/bundle/init.py
@@ -10,10 +10,7 @@ from openllm_core.utils import LazyModule

 _import_structure: dict[str, list[str]] = {
    '_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
-    'oci': [
-        'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name',
-        'supported_registries', 'RefResolver'
-    ]
+    'oci': ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver']
 }

 if t.TYPE_CHECKING:
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -43,8 +43,7 @@ logger = logging.getLogger(__name__)

 OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'

-def build_editable(path: str,
-                   package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
+def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
  '''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.'''
  if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != 'true': return None
  # We need to build the package in editable mode, so that we can import it
@@ -52,9 +51,7 @@ def build_editable(path: str,
  from build.env import IsolatedEnvBuilder
  module_location = openllm_core.utils.pkg.source_locations(package)
  if not module_location:
-    raise RuntimeError(
-        'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.'
-    )
+    raise RuntimeError('Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.')
  pyproject_path = Path(module_location).parent.parent / 'pyproject.toml'
  if os.path.isfile(pyproject_path.__fspath__()):
    logger.info('Generating built wheels for package %s...', package)
@@ -64,14 +61,13 @@ def build_editable(path: str,
      builder.scripts_dir = env.scripts_dir
      env.install(builder.build_system_requires)
      return builder.build('wheel', path, config_settings={'--global-option': '--quiet'})
-  raise RuntimeError(
-      'Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')
+  raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')

 def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
                             llm_fs: FS,
                             extra_dependencies: tuple[str, ...] | None = None,
                             adapter_map: dict[str, str | None] | None = None,
-                            ) -> PythonOptions:
+                             ) -> PythonOptions:
  packages = ['openllm', 'scipy']  # apparently bnb misses this one
  if adapter_map is not None: packages += ['openllm[fine-tune]']
  # NOTE: add openllm to the default dependencies
@@ -90,16 +86,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
  if backend_envvar == 'flax':
    if not openllm_core.utils.is_flax_available():
      raise ValueError(f"Flax is not available, while {env.backend} is set to 'flax'")
-    packages.extend(
-        [importlib.metadata.version('flax'),
-         importlib.metadata.version('jax'),
-         importlib.metadata.version('jaxlib')])
+    packages.extend([importlib.metadata.version('flax'), importlib.metadata.version('jax'), importlib.metadata.version('jaxlib')])
  elif backend_envvar == 'tf':
    if not openllm_core.utils.is_tf_available():
      raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'")
-    candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu',
-                  'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
-                 )
+    candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow',
+                  'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
+                  )
    # For the metadata, we have to look for both tensorflow and tensorflow-cpu
    for candidate in candidates:
      try:
@@ -125,15 +118,11 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
  return PythonOptions(packages=packages,
                       wheels=wheels,
                       lock_packages=False,
-                       extra_index_url=[
-                           'https://download.pytorch.org/whl/cu118',
-                           'https://huggingface.github.io/autogptq-index/whl/cu118/'
-                       ])
+                       extra_index_url=['https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/'])

-def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float,
-                             quantize: LiteralString | None, adapter_map: dict[str, str | None] | None,
-                             dockerfile_template: str | None, serialisation_format: t.Literal['safetensors', 'legacy'],
-                             container_registry: LiteralContainerRegistry,
+def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None,
+                             adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
+                             serialisation_format: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry,
                             container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
  from openllm.cli._factory import parse_config_options
  environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
@@ -156,10 +145,9 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize)

  if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
-  return DockerOptions(
-      base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
-      env=env_dict,
-      dockerfile_template=dockerfile_template)
+  return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
+                       env=env_dict,
+                       dockerfile_template=dockerfile_template)

 OPENLLM_MODEL_NAME = '# openllm: model name'
 OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
@@ -193,17 +181,15 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
 def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
  from openllm_core.utils import DEBUG
  model_name = llm.config['model_name']
-  logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'],
-               llm_fs.getsyspath('/'))
+  logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/'))
  with open(_service_file.__fspath__(), 'r') as f:
    src_contents = f.readlines()
  for it in src_contents:
    if OPENLLM_MODEL_NAME in it:
-      src_contents[src_contents.index(it)] = (
-          ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
+      src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
    elif OPENLLM_MODEL_ADAPTER_MAP in it:
-      src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(
-          orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
+      src_contents[src_contents.index(it)] = (
+          ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
  script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
  if DEBUG: logger.info('Generated script:\n%s', script)
  llm_fs.writetext(llm.config['service_name'], script)
@@ -235,14 +221,12 @@ def create_bento(bento_tag: bentoml.Tag,
  if isinstance(workers_per_resource, str):
    if workers_per_resource == 'round_robin': workers_per_resource = 1.0
    elif workers_per_resource == 'conserved':
-      workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 /
-                                                                                      openllm_core.utils.device_count())
+      workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count())
    else:
      try:
        workers_per_resource = float(workers_per_resource)
      except ValueError:
-        raise ValueError(
-            "'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
+        raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
  elif isinstance(workers_per_resource, int):
    workers_per_resource = float(workers_per_resource)
  logger.info("Building Bento for '%s'", llm.config['start_name'])
@@ -258,10 +242,8 @@ def create_bento(bento_tag: bentoml.Tag,
                                  exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
                                  python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
                                  models=[llm_spec],
-                                  docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize,
-                                                                  adapter_map, dockerfile_template,
-                                                                  serialisation_format, container_registry,
-                                                                  container_version_strategy))
+                                  docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template,
+                                                                  serialisation_format, container_registry, container_version_strategy))

  bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
  # NOTE: the model_id_path here are only used for setting this environment variable within the container built with for BentoLLM.
--- a/openllm-python/src/openllm/bundle/oci/__init__.py
+++ b/openllm-python/src/openllm/bundle/oci/__init__.py
@@ -68,8 +68,7 @@ def _commit_time_range(r: int = 5) -> str:
 class VersionNotSupported(openllm.exceptions.OpenLLMException):
  """Raised when the stable release is too low that it doesn't include OpenLLM base container."""

-_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple',
-                                                                             ['git_hash', 'version', 'strategy'])
+_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy'])

 def nightly_resolver(cls: type[RefResolver]) -> str:
  # NOTE: all openllm container will have sha-<git_hash[:7]>
@@ -84,10 +83,8 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
    return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message'])
  # now is the correct behaviour
  return orjson.loads(
-      subprocess.check_output([
-          docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags',
-          'docker://ghcr.io/bentoml/openllm'
-      ]).decode().strip())['Tags'][-2]
+      subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags',
+                               'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2]

 @attr.attrs(eq=False, order=False, slots=True, frozen=True)
 class RefResolver:
@@ -107,20 +104,16 @@ class RefResolver:
      # NOTE: This strategy will only support openllm>0.2.12
      meta: dict[str, t.Any] = cls._ghapi.repos.get_latest_release()
      version_str = meta['name'].lstrip('v')
-      version: tuple[str,
-                     str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
+      version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
    else:
      version = ('', version_str)
    if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12):
-      raise VersionNotSupported(
-          f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'"
-      )
+      raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
    return _RefTuple((*version, 'release' if _use_base_strategy else 'custom'))

  @classmethod
  @functools.lru_cache(maxsize=64)
-  def from_strategy(cls,
-                    strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver:
+  def from_strategy(cls, strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver:
    # using default strategy
    if strategy_or_version is None or strategy_or_version == 'release': return cls(*cls._release_ref())
    elif strategy_or_version == 'latest': return cls('latest', '0.0.0', 'latest')
@@ -128,8 +121,7 @@ class RefResolver:
      _ref = cls._nightly_ref()
      return cls(_ref[0], '0.0.0', _ref[-1])
    else:
-      logger.warning('Using custom %s. Make sure that it is at lease 0.2.12 for base container support.',
-                     strategy_or_version)
+      logger.warning('Using custom %s. Make sure that it is at lease 0.2.12 for base container support.', strategy_or_version)
      return cls(*cls._release_ref(version_str=strategy_or_version))

  @property
@@ -162,8 +154,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
  pyproject_path = pathlib.Path(_module_location).parent.parent / 'pyproject.toml'
  if not pyproject_path.exists():
    raise ValueError(
-        "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'"
-    )
+        "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
  if not registries:
    tags: dict[str | LiteralContainerRegistry, str] = {
        alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()
@@ -181,18 +172,14 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
    if machine and outputs is not None: tags['image_sha'] = outputs.decode('utf-8').strip()
  except Exception as err:
    raise openllm.exceptions.OpenLLMException(
-        f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}'
-    ) from err
+        f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err
  return tags

 if t.TYPE_CHECKING:
  CONTAINER_NAMES: dict[LiteralContainerRegistry, str]
  supported_registries: list[str]

-__all__ = [
-    'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries',
-    'RefResolver'
-]
+__all__ = ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver']

 def __dir__() -> list[str]:
  return sorted(__all__)
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -50,14 +50,10 @@ def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete
  ]

 def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-  return [
-      sc.CompletionItem(inflection.dasherize(it), help='Model')
-      for it in openllm.CONFIG_MAPPING
-      if it.startswith(incomplete)
-  ]
+  return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]

-def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float,
-                         device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
+def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool,
+                         environ: DictStrAny) -> DictStrAny:
  # TODO: Support amd.com/gpu on k8s
  _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
  _bentoml_config_options_opts = [
@@ -67,22 +63,15 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res
  ]
  if device:
    if len(device) > 1:
-      _bentoml_config_options_opts.extend([
-          f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
-          for idx, dev in enumerate(device)
-      ])
+      _bentoml_config_options_opts.extend(
+          [f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
    else:
-      _bentoml_config_options_opts.append(
-          f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
-  _bentoml_config_options_opts.append(
-      f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
+      _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
+  _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
  if cors:
+    _bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
    _bentoml_config_options_opts.extend(
-        ['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
-    _bentoml_config_options_opts.extend([
-        f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
-        for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
-    ])
+        [f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
  _bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
  environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
  if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
@@ -104,17 +93,13 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
    ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0] if len(adapter_name) > 0 else None
  return None

-def start_command_factory(group: click.Group,
-                          model: str,
-                          _context_settings: DictStrAny | None = None,
-                          _serve_grpc: bool = False) -> click.Command:
+def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command:
  llm_config = openllm.AutoConfig.for_model(model)
-  command_attrs: DictStrAny = dict(
-      name=llm_config['model_name'],
-      context_settings=_context_settings or termui.CONTEXT_SETTINGS,
-      short_help=f"Start a LLMServer for '{model}'",
-      aliases=[llm_config['start_name']] if llm_config['name_type'] == 'dasherize' else None,
-      help=f'''\
+  command_attrs: DictStrAny = dict(name=llm_config['model_name'],
+                                   context_settings=_context_settings or termui.CONTEXT_SETTINGS,
+                                   short_help=f"Start a LLMServer for '{model}'",
+                                   aliases=[llm_config['start_name']] if llm_config['name_type'] == 'dasherize' else None,
+                                   help=f'''\
 {llm_config['env'].start_docstring}

 \b
@@ -133,15 +118,13 @@ Available official model_id(s): [default: {llm_config['default_id']}]
 \b
 {orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()}
 ''',
-  )
+                                   )

  if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
    # NOTE: The model requires GPU, therefore we will return a dummy command
    command_attrs.update({
-        'short_help':
-            '(Disabled because there is no GPU available)',
-        'help':
-            f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
+        'short_help': '(Disabled because there is no GPU available)',
+        'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
    })
    return noop_command(group, llm_config, _serve_grpc, **command_attrs)

@@ -150,12 +133,10 @@ Available official model_id(s): [default: {llm_config['default_id']}]
  @click.pass_context
  def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
                workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
-                quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend,
-                serialisation_format: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None,
-                return_process: bool, **attrs: t.Any,
-               ) -> LLMConfig | subprocess.Popen[bytes]:
-    if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env(
-        'OPENLLM_SERIALIZATION_WARNING'):
+                quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend, serialisation_format: t.Literal['safetensors', 'legacy'],
+                cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
+                ) -> LLMConfig | subprocess.Popen[bytes]:
+    if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
      termui.echo(
          f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
          fg='yellow')
@@ -184,10 +165,7 @@ Available official model_id(s): [default: {llm_config['default_id']}]
      wpr = float(wpr)

    # Create a new model env to work with the envvar during CLI invocation
-    env = openllm.utils.EnvVarMixin(config['model_name'],
-                                    backend,
-                                    model_id=model_id or config['default_id'],
-                                    quantize=quantize)
+    env = openllm.utils.EnvVarMixin(config['model_name'], backend, model_id=model_id or config['default_id'], quantize=quantize)
    requirements = llm_config['requirements']
    if requirements is not None and len(requirements) > 0:
      missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
@@ -218,17 +196,14 @@ Available official model_id(s): [default: {llm_config['default_id']}]
                                                                         serialisation=serialisation_format)
    start_env.update({env.config: llm.config.model_dump_json().decode()})

-    server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer(
-        '_service:svc', **server_attrs)
+    server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
    openllm.utils.analytics.track_start_init(llm.config)

    def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
      cmd_name = f'openllm build {model_name}'
      if adapter_map is not None:
-        cmd_name += ' ' + ' '.join([
-            f'--adapter-id {s}'
-            for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
-        ])
+        cmd_name += ' ' + ' '.join(
+            [f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
      if not openllm.utils.get_quiet_mode():
        termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')

@@ -265,17 +240,13 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *
  return noop

 def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
-
  def wrapper(fn: FC) -> t.Callable[[FC], FC]:
    composed = openllm.utils.compose(
        llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
-        cog.optgroup.group(
-            'General LLM Options',
-            help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
+        cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
        model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup),
        cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
-        workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
-        backend_option(factory=cog.optgroup),
+        workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup), backend_option(factory=cog.optgroup),
        cog.optgroup.group('LLM Optimization Options',
                           help='''Optimization related options.

@@ -286,7 +257,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
            - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
            - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
            ''',
-                          ), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
+                           ), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup),
        cog.optgroup.option('--device',
                            type=openllm.utils.dantic.CUDA,
                            multiple=True,
@@ -312,8 +283,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
    '''),
        cog.optgroup.option('--adapter-id',
                            default=None,
-                            help='Optional name or path for given LoRA adapter' +
-                            f" to wrap '{llm_config['model_name']}'",
+                            help='Optional name or path for given LoRA adapter' + f" to wrap '{llm_config['model_name']}'",
                            multiple=True,
                            callback=_id_callback,
                            metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'),
@@ -323,8 +293,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab

  return wrapper

-def parse_device_callback(ctx: click.Context, param: click.Parameter,
-                          value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
+def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
  if value is None: return value
  if not isinstance(value, tuple): ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})')
  el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
@@ -342,19 +311,15 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]
  from bentoml_cli.cli import cli

  command = 'serve' if not serve_grpc else 'serve-grpc'
-  group = cog.optgroup.group(
-      f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options",
-      help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
-  )
+  group = cog.optgroup.group(f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options",
+                             help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
+                             )

  def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
    serve_command = cli.commands[command]
    # The first variable is the argument bento
    # The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
-    serve_options = [
-        p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
-        if p.name not in _IGNORED_OPTIONS
-    ]
+    serve_options = [p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS]
    for options in reversed(serve_options):
      attrs = options.to_info_dict()
      # we don't need param_type_name, since it should all be options
@@ -391,10 +356,7 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC |
 cli_option = functools.partial(_click_factory_type, attr='option')
 cli_argument = functools.partial(_click_factory_type, attr='argument')

-def output_option(f: _AnyCallable | None = None,
-                  *,
-                  default_value: LiteralOutput = 'pretty',
-                  **attrs: t.Any) -> t.Callable[[FC], FC]:
+def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]:
  output = ['json', 'pretty', 'porcelain']

  def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
@@ -434,12 +396,11 @@ def model_id_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable
                    **attrs)(f)

 def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--model-version',
-      type=click.STRING,
-      default=None,
-      help='Optional model version to save for this model. It will be inferred automatically from model-id.',
-      **attrs)(f)
+  return cli_option('--model-version',
+                    type=click.STRING,
+                    default=None,
+                    help='Optional model version to save for this model. It will be inferred automatically from model-id.',
+                    **attrs)(f)

 def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  # NOTE: LiteralBackend needs to remove the last two item as ggml and mlc is wip
@@ -453,10 +414,7 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[
                    **attrs)(f)

 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_argument('model_name',
-                      type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
-                      required=required,
-                      **attrs)(f)
+  return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)

 def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option('--quantise',
@@ -482,10 +440,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
      > [!NOTE] that quantization are currently only available in *PyTorch* models.''',
                    **attrs)(f)

-def workers_per_resource_option(f: _AnyCallable | None = None,
-                                *,
-                                build: bool = False,
-                                **attrs: t.Any) -> t.Callable[[FC], FC]:
+def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option('--workers-per-resource',
                    default=None,
                    callback=workers_per_resource_callback,
@@ -536,18 +491,16 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
                    **attrs)(f)

 def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--container-registry',
-      'container_registry',
-      type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
-      default='ecr',
-      show_default=True,
-      show_envvar=True,
-      envvar='OPENLLM_CONTAINER_REGISTRY',
-      callback=container_registry_callback,
-      help=
-      'The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
-      **attrs)(f)
+  return cli_option('--container-registry',
+                    'container_registry',
+                    type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
+                    default='ecr',
+                    show_default=True,
+                    show_envvar=True,
+                    envvar='OPENLLM_CONTAINER_REGISTRY',
+                    callback=container_registry_callback,
+                    help='The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker',
+                    **attrs)(f)

 _wpr_strategies = {'round_robin', 'conserved'}

@@ -559,9 +512,8 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
    try:
      float(value)  # type: ignore[arg-type]
    except ValueError:
-      raise click.BadParameter(
-          f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.",
-          ctx, param) from None
+      raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx,
+                               param) from None
    else:
      return value

--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -84,8 +84,7 @@ def _start(model_name: str,
  from .entrypoint import start_grpc_command
  llm_config = openllm.AutoConfig.for_model(model_name)
  _ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
-                                             backend=openllm_core.utils.first_not_none(
-                                                 backend, default=llm_config.default_backend()),
+                                             backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()),
                                             model_id=model_id,
                                             quantize=quantize)
  os.environ[_ModelEnv.backend] = _ModelEnv['backend_value']
@@ -94,26 +93,19 @@ def _start(model_name: str,
  if model_id: args.extend(['--model-id', model_id])
  if timeout: args.extend(['--server-timeout', str(timeout)])
  if workers_per_resource:
-    args.extend([
-        '--workers-per-resource',
-        str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource
-    ])
+    args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
  if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
  if quantize: args.extend(['--quantize', str(quantize)])
  if cors: args.append('--cors')
  if adapter_map:
-    args.extend(
-        list(
-            itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()
-                                          ])))
+    args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
  if additional_args: args.extend(additional_args)
  if __test__: args.append('--return-process')

  return start_command_factory(start_command if not _serve_grpc else start_grpc_command,
                               model_name,
                               _context_settings=termui.CONTEXT_SETTINGS,
-                               _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None,
-                                                             standalone_mode=False)
+                               _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False)

@inject
 def _build(model_name: str,
@@ -180,9 +172,7 @@ def _build(model_name: str,
  Returns:
      ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
-  args: list[str] = [
-      sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format
-  ]
+  args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format]
  if quantize: args.extend(['--quantize', quantize])
  if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
  if push: args.extend(['--push'])
@@ -265,8 +255,7 @@ def _list_models() -> dict[str, t.Any]:
  from .entrypoint import models_command
  return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)

-start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(
-    _start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
-        _start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
-            _import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
+start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
+    _start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
+        _import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
 __all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']
--- a/openllm-python/src/openllm/cli/extension/build_base_container.py
+++ b/openllm-python/src/openllm/cli/extension/build_base_container.py
@@ -34,8 +34,8 @@ if t.TYPE_CHECKING:
              help='Version strategy to use for tagging the image.')
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
-def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None,
-        version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
+def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool,
+        machine: bool) -> dict[str, str]:
  mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
  if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return mapping
--- a/openllm-python/src/openllm/cli/extension/dive_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py
@@ -24,10 +24,7 @@ if t.TYPE_CHECKING:
@machine_option
@click.pass_context
@inject
-def cli(ctx: click.Context,
-        bento: str,
-        machine: bool,
-        _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
+def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
  '''Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path).'''
  try:
    bentomodel = _bento_store.get(bento)
--- a/openllm-python/src/openllm/cli/extension/get_containerfile.py
+++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py
@@ -19,9 +19,7 @@ from openllm_core.utils import bentoml_cattr
 if t.TYPE_CHECKING:
  from bentoml._internal.bento import BentoStore

-@click.command('get_containerfile',
-               context_settings=termui.CONTEXT_SETTINGS,
-               help='Return Containerfile of any given Bento.')
+@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject
--- a/openllm-python/src/openllm/cli/extension/get_prompt.py
+++ b/openllm-python/src/openllm/cli/extension/get_prompt.py
@@ -32,8 +32,8 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain']
              callback=opt_callback,
              metavar='ARG=VALUE[,ARG=VALUE]')
@click.pass_context
-def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool,
-        _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
+def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any],
+        **_: t.Any) -> str | None:
  '''Get the default prompt used by OpenLLM.'''
  module = openllm.utils.EnvVarMixin(model_name).module
  _memoized = {k: v[0] for k, v in _memoized.items() if v}
@@ -46,15 +46,11 @@ def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None,
      if format is None:
        if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None:
          raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
-        raise click.BadOptionUsage(
-            'format',
-            f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
+        raise click.BadOptionUsage('format', f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
      if prompt_mapping is None:
-        raise click.BadArgumentUsage(
-            f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None
+        raise click.BadArgumentUsage(f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None
      if format not in prompt_mapping:
-        raise click.BadOptionUsage(
-            'format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
+        raise click.BadOptionUsage('format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
      _prompt_template = template(format)
    else:
      _prompt_template = template
--- a/openllm-python/src/openllm/cli/extension/list_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/list_bentos.py
@@ -19,26 +19,28 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
  '''List available bentos built by OpenLLM.'''
  mapping = {
      k: [{
-          'tag':
-              str(b.tag),
-          'size':
-              human_readable_size(openllm.utils.calc_dir_size(b.path)),
+          'tag': str(b.tag),
+          'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
          'models': [{
              'tag': str(m.tag),
              'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
-          } for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
-      } for b in tuple(i for i in bentoml.list() if all(
-          k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k
-         ] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
+          }
+                     for m in (bentoml.models.get(_.tag)
+                               for _ in b.info.models)]
+      }
+          for b in tuple(i
+                         for i in bentoml.list()
+                         if all(k in i.info.labels
+                                for k in {'start_name', 'bundler'}))
+          if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
  }
  mapping = {k: v for k, v in mapping.items() if v}
  if output == 'pretty':
    import tabulate
    tabulate.PRESERVE_WHITESPACE = True
-    termui.echo(tabulate.tabulate(
-        [(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v],
-        tablefmt='fancy_grid',
-        headers=['LLM', 'Tag', 'Size', 'Models']),
+    termui.echo(tabulate.tabulate([(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v],
+                                  tablefmt='fancy_grid',
+                                  headers=['LLM', 'Tag', 'Size', 'Models']),
                fg='white')
  else:
    termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
--- a/openllm-python/src/openllm/cli/extension/list_models.py
+++ b/openllm-python/src/openllm/cli/extension/list_models.py
@@ -26,17 +26,14 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
  models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
  ids_in_local_store = {
      k: [
-          i for i in bentoml.models.list() if 'framework' in i.info.labels and
-          i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
+          i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and
+          'model_name' in i.info.labels and i.info.labels['model_name'] == k
      ] for k in models
  }
  if model_name is not None:
    ids_in_local_store = {
-        k: [
-            i
-            for i in v
-            if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)
-        ] for k, v in ids_in_local_store.items()
+        k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)]
+        for k, v in ids_in_local_store.items()
    }
  ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
  local_models = {
--- a/openllm-python/src/openllm/cli/extension/playground.py
+++ b/openllm-python/src/openllm/cli/extension/playground.py
@@ -34,12 +34,7 @@ def load_notebook_metadata() -> DictStrAny:

@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('output-dir', default=None, required=False)
-@click.option('--port',
-              envvar='JUPYTER_PORT',
-              show_envvar=True,
-              show_default=True,
-              default=8888,
-              help='Default port for Jupyter server')
+@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
@click.pass_context
 def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  """OpenLLM Playground.
@@ -60,9 +55,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
  """
  if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
-    raise RuntimeError(
-        "Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
-    )
+    raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
  metadata = load_notebook_metadata()
  _temp_dir = False
  if output_dir is None:
@@ -74,8 +67,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
  for module in pkgutil.iter_modules(playground.__path__):
    if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
-      logger.debug('Skipping: %s (%s)', module.name,
-                   'File already exists' if not module.ispkg else f'{module.name} is a module')
+      logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
      continue
    if not isinstance(module.module_finder, importlib.machinery.FileFinder): continue
    termui.echo('Generating notebook for: ' + module.name, fg='magenta')
@@ -84,10 +76,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
    f.cells.insert(0, markdown_cell)
    jupytext.write(f, os.path.join(output_dir, module.name + '.ipynb'), fmt='notebook')
  try:
-    subprocess.check_output([
-        sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port',
-        str(port), '--no-browser', '--debug'
-    ])
+    subprocess.check_output([sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port', str(port), '--no-browser', '--debug'])
  except subprocess.CalledProcessError as e:
    termui.echo(e.output, fg='red')
    raise click.ClickException(f'Failed to start a jupyter server:\n{e}') from None
--- a/openllm-python/src/openllm/cli/termui.py
+++ b/openllm-python/src/openllm/cli/termui.py
@@ -16,9 +16,5 @@ def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.An
    t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)

 COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
-CONTEXT_SETTINGS: DictStrAny = {
-    'help_option_names': ['-h', '--help'],
-    'max_content_width': COLUMNS,
-    'token_normalize_func': inflection.underscore
-}
+CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
 __all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS']
--- a/openllm-python/src/openllm/models/auto/factory.py
+++ b/openllm-python/src/openllm/models/auto/factory.py
@@ -30,9 +30,7 @@ class BaseAutoLLMClass:
  _model_mapping: t.ClassVar[_LazyAutoMapping]

  def __init__(self, *args: t.Any, **attrs: t.Any):
-    raise EnvironmentError(
-        f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead."
-    )
+    raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.")

  @classmethod
  def for_model(cls,
@@ -50,10 +48,7 @@ class BaseAutoLLMClass:
    >>> llm = openllm.AutoLLM.for_model("flan-t5")
    ```
    '''
-    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id,
-                                                           model_version=model_version,
-                                                           llm_config=llm_config,
-                                                           **attrs)
+    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
    if ensure_available: llm.ensure_model_id_exists()
    return llm

@@ -116,9 +111,7 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
  This OrderedDict values() and keys() returns the list instead, so you don't
  have to do list(mapping.values()) to get the list of values.
  """
-
-  def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString],
-               model_mapping: OrderedDict[LiteralString, LiteralString]):
+  def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], model_mapping: OrderedDict[LiteralString, LiteralString]):
    self._config_mapping = config_mapping
    self._reverse_config_mapping = {v: k for k, v in config_mapping.items()}
    self._model_mapping = model_mapping
@@ -153,32 +146,26 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
    return ReprMixin.__repr__(self)

  def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]:
-    yield from ((key, (value, self._model_mapping[key]))
-                for key, value in self._config_mapping.items()
-                if key in self._model_mapping)
+    yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping)

  def __bool__(self) -> bool:
    return bool(self.keys())

  def keys(self) -> ConfigModelKeysView:
-    return t.cast('ConfigModelKeysView', [
-        self._load_attr_from_module(key, name)
-        for key, name in self._config_mapping.items()
-        if key in self._model_mapping.keys()
-    ] + list(self._extra_content.keys()))
+    return t.cast('ConfigModelKeysView',
+                  [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] +
+                  list(self._extra_content.keys()))

  def values(self) -> ConfigModelValuesView:
-    return t.cast('ConfigModelValuesView', [
-        self._load_attr_from_module(key, name)
-        for key, name in self._model_mapping.items()
-        if key in self._config_mapping.keys()
-    ] + list(self._extra_content.values()))
+    return t.cast('ConfigModelValuesView',
+                  [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] +
+                  list(self._extra_content.values()))

  def items(self) -> ConfigModelItemsView:
-    return t.cast('ConfigModelItemsView', [(self._load_attr_from_module(
-        key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
-                                           for key in self._model_mapping.keys()
-                                           if key in self._config_mapping.keys()] + list(self._extra_content.items()))
+    return t.cast('ConfigModelItemsView',
+                  [(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
+                   for key in self._model_mapping.keys()
+                   if key in self._config_mapping.keys()] + list(self._extra_content.items()))

  def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]:
    return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys()))
--- a/openllm-python/src/openllm/models/auto/modeling_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_auto.py
@@ -7,10 +7,9 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass
 from .factory import _LazyAutoMapping

-MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'),
-                                   ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
-                                   ('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'),
-                                   ('baichuan', 'Baichuan')])
+MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'),
+                                   ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), ('opt', 'OPT'), ('stablelm', 'StableLM'),
+                                   ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)

 class AutoLLM(BaseAutoLLMClass):
--- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
@@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass
 from .factory import _LazyAutoMapping

-MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'),
-                                        ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
-                                        ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
+MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'),
+                                        ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
                                        ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
 MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)

--- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
@@ -11,6 +11,5 @@ class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrai
    import torch
    inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
-      outputs = self.model.generate(**inputs,
-                                    generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
      return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -14,9 +14,7 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
      self.model.eval()
      # Only use half precision if the model is not yet quantized
      if self.config.use_half_precision: self.model.half()
-      return self.model.chat(self.tokenizer,
-                             prompt,
-                             generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())

  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
    import torch
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -11,8 +11,9 @@ from openllm_core.config.configuration_dolly_v2 import RESPONSE_KEY
 from openllm_core.config.configuration_dolly_v2 import get_special_token_id
 if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
 else:
-  torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader(
-      'transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
+  torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(),
+                                                                                                            'transformers'), openllm.utils.LazyLoader(
+                                                                                                                'tf', globals(), 'tensorflow')
 logger = logging.getLogger(__name__)

@overload
@@ -35,22 +36,8 @@ def get_pipeline(model: transformers.PreTrainedModel,
                 **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
  # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
  class InstructionTextGenerationPipeline(transformers.Pipeline):
-
-    def __init__(self,
-                 *args: t.Any,
-                 do_sample: bool = True,
-                 max_new_tokens: int = 256,
-                 top_p: float = 0.92,
-                 top_k: int = 0,
-                 **kwargs: t.Any):
-      super().__init__(*args,
-                       model=model,
-                       tokenizer=tokenizer,
-                       do_sample=do_sample,
-                       max_new_tokens=max_new_tokens,
-                       top_p=top_p,
-                       top_k=top_k,
-                       **kwargs)
+    def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any):
+      super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)

    def _sanitize_parameters(self,
                             return_full_text: bool | None = None,
@@ -59,8 +46,7 @@ def get_pipeline(model: transformers.PreTrainedModel,
      preprocess_params: dict[str, t.Any] = {}
      # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
      # append a newline to yield a single token.  find whatever token is configured for the response key.
-      tokenizer_response_key = next(
-          (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
+      tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
      response_key_token_id = None
      end_key_token_id = None
      if tokenizer_response_key:
@@ -84,17 +70,15 @@ def get_pipeline(model: transformers.PreTrainedModel,
      inputs['instruction_text'] = input_
      return t.cast(t.Dict[str, t.Any], inputs)

-    def _forward(self, input_tensors: dict[str, t.Any],
-                 **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
+    def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      input_ids, attention_mask = input_tensors['input_ids'], input_tensors.get('attention_mask', None)
      if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
      else: in_b = input_ids.shape[0]
-      generated_sequence = self.model.generate(
-          input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
-          attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
-          pad_token_id=self.tokenizer.pad_token_id,
-          **generate_kwargs)
+      generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
+                                               attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
+                                               pad_token_id=self.tokenizer.pad_token_id,
+                                               **generate_kwargs)
      out_b = generated_sequence.shape[0]
      if self.framework == 'pt':
        generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
@@ -162,10 +146,7 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
-    return {
-        'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
-        'torch_dtype': torch.bfloat16
-    }, {}
+    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {}

  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
    return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
@@ -176,6 +157,4 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken
  def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
    llm_config = self.config.model_construct_env(**attrs)
    with torch.inference_mode():
-      return self.model(prompt,
-                        return_full_text=llm_config.return_full_text,
-                        generation_config=llm_config.to_generation_config())
+      return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
--- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py
@@ -4,42 +4,31 @@ import typing as t
 import openllm
 if t.TYPE_CHECKING: import torch, transformers
 else:
-  torch, transformers = openllm.utils.LazyLoader('torch', globals(),
-                                                 'torch'), openllm.utils.LazyLoader('transformers', globals(),
-                                                                                    'transformers')
+  torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers')

 class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
-    return {
-        'torch_dtype': torch.bfloat16,
-        'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None
-    }, {}
+    return {'torch_dtype': torch.bfloat16, 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    eos_token_id, inputs = attrs.pop('eos_token_id',
-                                     self.tokenizer.eos_token_id), self.tokenizer(prompt,
-                                                                                  return_tensors='pt').to(self.device)
+    eos_token_id, inputs = attrs.pop('eos_token_id', self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors='pt').to(self.device)
    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
-      return self.tokenizer.batch_decode(self.model.generate(
-          input_ids=inputs['input_ids'],
-          attention_mask=inputs['attention_mask'],
-          generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()),
+      return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs['input_ids'],
+                                                             attention_mask=inputs['attention_mask'],
+                                                             generation_config=self.config.model_construct_env(eos_token_id=eos_token_id,
+                                                                                                               **attrs).to_generation_config()),
                                         skip_special_tokens=True)

-  def generate_one(self, prompt: str, stop: list[str],
-                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
-        prompt, return_tensors='pt').to(self.device)
-    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
-        'stopping_criteria', openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
+                                                                                                    openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
    result = self.tokenizer.decode(
-        self.model.generate(encoded_inputs['input_ids'],
-                            max_new_tokens=max_new_tokens,
-                            stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+        self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -11,11 +11,10 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
    with torch.inference_mode():
-      return self.tokenizer.batch_decode(
-          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
-                              do_sample=True,
-                              generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-          skip_special_tokens=True)
+      return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
+                                                             do_sample=True,
+                                                             generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+                                         skip_special_tokens=True)

  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
    import torch
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -32,10 +32,9 @@ class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'tra
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
    decoder_start_token_id = attrs.pop('decoder_start_token_id', 0)
-    return self.tokenizer.batch_decode(self.model.generate(
-        self.tokenizer(prompt, return_tensors='np')['input_ids'],
-        do_sample=True,
-        generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-        decoder_start_token_id=decoder_start_token_id).sequences,
+    return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors='np')['input_ids'],
+                                                           do_sample=True,
+                                                           generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+                                                           decoder_start_token_id=decoder_start_token_id).sequences,
                                       skip_special_tokens=True,
                                       clean_up_tokenization_spaces=True)
--- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
@@ -8,8 +8,7 @@ class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transfo
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    return self.tokenizer.batch_decode(self.model.generate(
-        self.tokenizer(prompt, return_tensors='tf').input_ids,
-        do_sample=True,
-        generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+    return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors='tf').input_ids,
+                                                           do_sample=True,
+                                                           generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
                                       skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/llama/modeling_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_llama.py
@@ -26,17 +26,13 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
    return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
                                    num_tokens=int(torch.sum(attention_mask).item()))

-  def generate_one(self, prompt: str, stop: list[str],
-                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
-        prompt, return_tensors='pt').to(self.device)
-    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
-        'stopping_criteria', openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
+                                                                                                    openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
    result = self.tokenizer.decode(
-        self.model.generate(encoded_inputs['input_ids'],
-                            max_new_tokens=max_new_tokens,
-                            stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+        self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
--- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
@@ -48,11 +48,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
    torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
    device_map = attrs.pop('device_map', None)
    attrs.pop('low_cpu_mem_usage', None)
-    config = get_mpt_config(self.model_id,
-                            self.config.max_sequence_length,
-                            self.device,
-                            device_map=device_map,
-                            trust_remote_code=trust_remote_code)
+    config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
    if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
@@ -62,10 +58,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
                                                              device_map=device_map,
                                                              **attrs)
    try:
-      return bentoml.transformers.save_model(self.tag,
-                                             model,
-                                             custom_objects={'tokenizer': tokenizer},
-                                             labels=generate_labels(self))
+      return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
    finally:
      torch.cuda.empty_cache()

@@ -79,7 +72,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
                            self.device,
                            device_map=device_map,
                            trust_remote_code=trust_remote_code,
-                           )
+                            )
    model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path,
                                                              config=config,
                                                              trust_remote_code=trust_remote_code,
--- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
@@ -16,12 +16,11 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
  __openllm_internal__ = True

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
-    config, tokenizer = transformers.AutoConfig.from_pretrained(
-        self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
+        self.model_id, **self.llm_parameters[-1])
    tokenizer.pad_token_id = config.pad_token_id
    return bentoml.transformers.save_model(self.tag,
-                                           transformers.FlaxAutoModelForCausalLM.from_pretrained(
-                                               self.model_id, **attrs),
+                                           transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs),
                                           custom_objects={'tokenizer': tokenizer},
                                           labels=generate_labels(self))

@@ -45,6 +44,5 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='np'),
                                                           do_sample=True,
-                                                           generation_config=self.config.model_construct_env(
-                                                               **attrs).to_generation_config()).sequences,
+                                                           generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
                                       skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_opt.py
@@ -18,8 +18,7 @@ class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
    with torch.inference_mode():
-      return self.tokenizer.batch_decode(
-          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
-                              do_sample=True,
-                              generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-          skip_special_tokens=True)
+      return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
+                                                             do_sample=True,
+                                                             generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+                                         skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
@@ -11,18 +11,16 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
    import transformers
-    config, tokenizer = transformers.AutoConfig.from_pretrained(
-        self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
+        self.model_id, **self.llm_parameters[-1])
    tokenizer.pad_token_id = config.pad_token_id
    return bentoml.transformers.save_model(self.tag,
-                                           transformers.TFOPTForCausalLM.from_pretrained(
-                                               self.model_id, trust_remote_code=trust_remote_code, **attrs),
+                                           transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),
                                           custom_objects={'tokenizer': tokenizer},
                                           labels=generate_labels(self))

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    return self.tokenizer.batch_decode(
-        self.model.generate(**self.tokenizer(prompt, return_tensors='tf'),
-                            do_sample=True,
-                            generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-        skip_special_tokens=True)
+    return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='tf'),
+                                                           do_sample=True,
+                                                           generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+                                       skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
@@ -17,11 +17,10 @@ class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTN
    import torch
    with torch.inference_mode():
      return [
-          self.tokenizer.decode(
-              self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
-                                  do_sample=True,
-                                  generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-                                  pad_token_id=self.tokenizer.eos_token_id,
-                                  stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0],
-              skip_special_tokens=True)
+          self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
+                                                    do_sample=True,
+                                                    generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+                                                    pad_token_id=self.tokenizer.eos_token_id,
+                                                    stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0],
+                                skip_special_tokens=True)
      ]
--- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
@@ -28,19 +28,10 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
    import transformers
    torch_dtype, device_map = attrs.pop('torch_dtype', torch.float16), attrs.pop('device_map', 'auto')
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
-    tokenizer.add_special_tokens({
-        'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
-        'pad_token': EOD
-    })
-    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
-                                                              torch_dtype=torch_dtype,
-                                                              device_map=device_map,
-                                                              **attrs)
+    tokenizer.add_special_tokens({'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], 'pad_token': EOD})
+    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
    try:
-      return bentoml.transformers.save_model(self.tag,
-                                             model,
-                                             custom_objects={'tokenizer': tokenizer},
-                                             labels=generate_labels(self))
+      return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
    finally:
      torch.cuda.empty_cache()

@@ -49,26 +40,21 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
    with torch.inference_mode():
      # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
      # NOTE: support fine-tuning starcoder
-      result_tensor = self.model.generate(
-          self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
-          do_sample=True,
-          pad_token_id=self.tokenizer.eos_token_id,
-          generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
+                                          do_sample=True,
+                                          pad_token_id=self.tokenizer.eos_token_id,
+                                          generation_config=self.config.model_construct_env(**attrs).to_generation_config())
      # TODO: We will probably want to return the tokenizer here so that we can manually process this
      # return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
      return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

-  def generate_one(self, prompt: str, stop: list[str],
-                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
-        prompt, return_tensors='pt').to(self.device)
-    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
-        'stopping_criteria', openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
+                                                                                                    openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
    result = self.tokenizer.decode(
-        self.model.generate(encoded_inputs['input_ids'],
-                            max_new_tokens=max_new_tokens,
-                            stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+        self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
--- a/openllm-python/src/openllm/playground/falcon_tuned.py
+++ b/openllm-python/src/openllm/playground/falcon_tuned.py
@@ -61,16 +61,13 @@ model, tokenizer = openllm.AutoLLM.for_model("falcon",
                                             quantize="int4",
                                             bnb_4bit_quant_type="nf4",
                                             bnb_4bit_compute_dtype=torch.float16,
-                                             ensure_available=True).prepare_for_training(adapter_type="lora",
-                                                                                         lora_alpha=16,
-                                                                                         lora_dropout=0.1,
-                                                                                         r=16,
-                                                                                         bias="none",
-                                                                                         target_modules=[
-                                                                                             "query_key_value", "dense",
-                                                                                             "dense_h_to_4h",
-                                                                                             "dense_4h_to_h"
-                                                                                         ])
+                                             ensure_available=True).prepare_for_training(
+                                                 adapter_type="lora",
+                                                 lora_alpha=16,
+                                                 lora_dropout=0.1,
+                                                 r=16,
+                                                 bias="none",
+                                                 target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"])
 model.config.use_cache = False
 tokenizer.pad_token = tokenizer.eos_token

@@ -81,9 +78,8 @@ trainer = SFTTrainer(model=model,
                     dataset_text_field="text",
                     max_seq_length=model_args.max_sequence_length,
                     tokenizer=tokenizer,
-                     args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
-                                              **dataclasses.asdict(training_args)),
-                    )
+                     args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
+                     )

 # upcast layernorm in float32 for more stable training
 for name, module in trainer.model.named_modules():
--- a/openllm-python/src/openllm/playground/llama2_qlora.py
+++ b/openllm-python/src/openllm/playground/llama2_qlora.py
@@ -78,10 +78,7 @@ def chunk(sample, chunk_length=2048):
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

  # Split by chunks of max_len.
-  result = {
-      k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
-      for k, t in concatenated_examples.items()
-  }
+  result = {k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)] for k, t in concatenated_examples.items()}
  # add remainder to global variable for next batch
  remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
  # prepare labels
@@ -101,8 +98,7 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
  print("Sample from dolly-v2 ds:", dataset[randint(0, len(dataset))]["text"])

  # tokenize and chunk dataset
-  lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]),
-                           batched=True,
+  lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True,
                           remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True)

  # Print total number of samples
@@ -113,7 +109,7 @@ def prepare_for_int4_training(model_id: str,
                              model_version: str | None = None,
                              gradient_checkpointing: bool = True,
                              bf16: bool = True,
-                             ) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
+                              ) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
  from peft.tuners.lora import LoraLayer

  llm = openllm.AutoLLM.for_model("llama",
@@ -124,16 +120,14 @@ def prepare_for_int4_training(model_id: str,
                                  bnb_4bit_compute_dtype=torch.bfloat16,
                                  use_cache=not gradient_checkpointing,
                                  device_map="auto",
-                                 )
+                                  )
  print("Model summary:", llm.model)

  # get lora target modules
  modules = find_all_linear_names(llm.model)
  print(f"Found {len(modules)} modules to quantize: {modules}")

-  model, tokenizer = llm.prepare_for_training(adapter_type="lora",
-                                              use_gradient_checkpointing=gradient_checkpointing,
-                                              target_modules=modules)
+  model, tokenizer = llm.prepare_for_training(adapter_type="lora", use_gradient_checkpointing=gradient_checkpointing, target_modules=modules)

  # pre-process the model by upcasting the layer norms in float 32 for
  for name, module in model.named_modules():
@@ -189,7 +183,7 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
  model, tokenizer = prepare_for_int4_training(model_args.model_id,
                                               gradient_checkpointing=training_args.gradient_checkpointing,
                                               bf16=training_args.bf16,
-                                              )
+                                               )
  datasets = prepare_datasets(tokenizer)

  trainer = transformers.Trainer(model=model,
@@ -197,7 +191,7 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
                                                          **dataclasses.asdict(training_args)),
                                 train_dataset=datasets,
                                 data_collator=transformers.default_data_collator,
-                                )
+                                 )

  trainer.train()

@@ -212,14 +206,10 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
    del model, trainer
    torch.cuda.empty_cache()

-    model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir,
-                                                          low_cpu_mem_usage=True,
-                                                          torch_dtype=torch.float16)
+    model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
    # merge lora with base weights and save
    model = model.merge_and_unload()
-    model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"),
-                          safe_serialization=True,
-                          max_shard_size="2GB")
+    model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"), safe_serialization=True, max_shard_size="2GB")
  else:
    trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora"))

--- a/openllm-python/src/openllm/playground/opt_tuned.py
+++ b/openllm-python/src/openllm/playground/opt_tuned.py
@@ -26,14 +26,12 @@ if t.TYPE_CHECKING:

 DEFAULT_MODEL_ID = "facebook/opt-6.7b"

-def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any,
-                 training_args: TrainingArguments):
+def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments):
  return transformers.Trainer(model=model,
                              train_dataset=dataset_dict["train"],
-                              args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
-                                                       **dataclasses.asdict(training_args)),
+                              args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
                              data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
-                             )
+                              )

@dataclasses.dataclass
 class TrainingArguments:
@@ -58,16 +56,13 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
 else:
  model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())

-model, tokenizer = openllm.AutoLLM.for_model("opt",
-                                             model_id=model_args.model_id,
-                                             quantize="int8",
-                                             ensure_available=True).prepare_for_training(
-                                                 adapter_type="lora",
-                                                 r=16,
-                                                 lora_alpha=32,
-                                                 target_modules=["q_proj", "v_proj"],
-                                                 lora_dropout=0.05,
-                                                 bias="none")
+model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8",
+                                             ensure_available=True).prepare_for_training(adapter_type="lora",
+                                                                                         r=16,
+                                                                                         lora_alpha=32,
+                                                                                         target_modules=["q_proj", "v_proj"],
+                                                                                         lora_dropout=0.05,
+                                                                                         bias="none")

 # ft on english_quotes
 data = load_dataset("Abirate/english_quotes")
--- a/openllm-python/src/openllm/serialisation/init.py
+++ b/openllm-python/src/openllm/serialisation/init.py
@@ -59,14 +59,12 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
  return tokenizer

 class _Caller(t.Protocol[P]):
-
  def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
    ...

 _extras = ['get', 'import_model', 'load_model']

 def _make_dispatch_function(fn: str) -> _Caller[P]:
-
  def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
    """Generic function dispatch to correct serialisation submodules based on LLM runtime.

--- a/openllm-python/src/openllm/serialisation/constants.py
+++ b/openllm-python/src/openllm/serialisation/constants.py
@@ -7,6 +7,5 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = {
    'vllm': ('AutoModelForCausalLM', 'AutoModelForSeq2SeqLM')
 }
 HUB_ATTRS = [
-    'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision',
-    'subfolder', 'use_auth_token'
+    'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token'
 ]
--- a/openllm-python/src/openllm/serialisation/ggml.py
+++ b/openllm-python/src/openllm/serialisation/ggml.py
@@ -13,11 +13,7 @@ if t.TYPE_CHECKING:

 _conversion_strategy = {'pt': 'ggml'}

-def import_model(llm: openllm.LLM[t.Any, t.Any],
-                 *decls: t.Any,
-                 trust_remote_code: bool = True,
-                 **attrs: t.Any,
-                ) -> bentoml.Model:
+def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model:
  raise NotImplementedError('Currently work in progress.')

 def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
--- a/openllm-python/src/openllm/serialisation/transformers/init.py
+++ b/openllm-python/src/openllm/serialisation/transformers/init.py
@@ -68,24 +68,18 @@ def import_model(llm: openllm.LLM[M, T],
  config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
  _, tokenizer_attrs = llm.llm_parameters
  quantize_method = llm._quantize_method
-  safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'),
-                                                    default=llm._serialisation_format == 'safetensors')
+  safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors')
  # Disable safe serialization with vLLM
  if llm.__llm_backend__ == 'vllm': safe_serialisation = False
-  metadata: DictStrAny = {
-      'safe_serialisation': safe_serialisation,
-      '_quantize': quantize_method is not None and quantize_method
-  }
+  metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method}
  signatures: DictStrAny = {}

  if quantize_method == 'gptq':
    if not openllm.utils.is_autogptq_available():
      raise openllm.exceptions.OpenLLMException(
-          "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
-      )
+          "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
    if llm.config['model_type'] != 'causal_lm':
-      raise openllm.exceptions.OpenLLMException(
-          f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
+      raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
    signatures['generate'] = {'batchable': False}
  else:
    # this model might be called with --quantize int4, therefore we need to pop this out
@@ -95,10 +89,7 @@ def import_model(llm: openllm.LLM[M, T],
    if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation
    metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__

-  tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id,
-                                                             trust_remote_code=trust_remote_code,
-                                                             **hub_attrs,
-                                                             **tokenizer_attrs)
+  tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
  if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token

  external_modules: list[types.ModuleType] = [importlib.import_module(tokenizer.__module__)]
@@ -117,25 +108,18 @@ def import_model(llm: openllm.LLM[M, T],
      if quantize_method == 'gptq':
        if not openllm.utils.is_autogptq_available():
          raise openllm.exceptions.OpenLLMException(
-              "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
-          )
+              "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
        if llm.config['model_type'] != 'causal_lm':
-          raise openllm.exceptions.OpenLLMException(
-              f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
+          raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
        logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
        model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id,
                                                            *decls,
-                                                            quantize_config=t.cast('autogptq.BaseQuantizeConfig',
-                                                                                   llm.quantization_config),
+                                                            quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
                                                            trust_remote_code=trust_remote_code,
                                                            use_safetensors=safe_serialisation,
                                                            **hub_attrs,
                                                            **attrs)
-        update_model(bentomodel,
-                     metadata={
-                         '_pretrained_class': model.__class__.__name__,
-                         '_framework': model.model.framework
-                     })
+        update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework})
        model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation)
      else:
        architectures = getattr(config, 'architectures', [])
@@ -159,18 +143,14 @@ def import_model(llm: openllm.LLM[M, T],
          model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
        else:
          # we will clone the all tings into the bentomodel path without loading model into memory
-          snapshot_download(llm.model_id,
-                            local_dir=bentomodel.path,
-                            local_dir_use_symlinks=False,
-                            ignore_patterns=HfIgnore.ignore_patterns(llm))
+          snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
    except Exception:
      raise
    else:
      bentomodel.flush()  # type: ignore[no-untyped-call]
      bentomodel.save(_model_store)
      openllm.utils.analytics.track(
-          openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module,
-                                                 model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
+          openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
    finally:
      bentomodel.exit_cloudpickle_context(imported_modules)
      # NOTE: We need to free up the cache after importing the model
@@ -189,36 +169,29 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
  try:
    model = bentoml.models.get(llm.tag)
    if Version(model.info.api_version) < Version('v2'):
-      raise openllm.exceptions.OpenLLMException(
-          'Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
+      raise openllm.exceptions.OpenLLMException('Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.')
    if model.info.labels['backend'] != llm.__llm_backend__:
      raise openllm.exceptions.OpenLLMException(
-          f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}."
-      )
+          f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.")
    return model
  except Exception as err:
    if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code)
-    raise openllm.exceptions.OpenLLMException(
-        f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err
+    raise openllm.exceptions.OpenLLMException(f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err

 def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
  config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs)
-  safe_serialization = openllm.utils.first_not_none(t.cast(
-      t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
+  safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
                                                    attrs.pop('safe_serialization', None),
                                                    default=llm._serialisation_format == 'safetensors')
  if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
    if not openllm.utils.is_autogptq_available():
      raise openllm.exceptions.OpenLLMException(
-          "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
-      )
+          "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
    if llm.config['model_type'] != 'causal_lm':
-      raise openllm.exceptions.OpenLLMException(
-          f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
+      raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
    return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,
                                                       *decls,
-                                                       quantize_config=t.cast('autogptq.BaseQuantizeConfig',
-                                                                              llm.quantization_config),
+                                                       quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
                                                       trust_remote_code=llm.trust_remote_code,
                                                       use_safetensors=safe_serialization,
                                                       **hub_attrs,
--- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py
+++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py
@@ -24,13 +24,11 @@ if t.TYPE_CHECKING:
  from openllm_core._typing_compat import T
 else:
  transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(),
-                                                      'transformers'), openllm_core.utils.LazyLoader(
-                                                          'torch', globals(), 'torch')
+                                                      'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')

 _object_setattr = object.__setattr__

-def process_config(model_id: str, trust_remote_code: bool,
-                   **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
+def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
  '''A helper function that correctly parse config and attributes for transformers.PretrainedConfig.

  Args:
@@ -55,8 +53,7 @@ def process_config(model_id: str, trust_remote_code: bool,
  return config, hub_attrs, attrs

 def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T:
-  __cls = getattr(transformers,
-                  openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None)
+  __cls = getattr(transformers, openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None)
  if __cls is None:
    raise ValueError(f'Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`')
  return __cls
@@ -105,13 +102,11 @@ def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
  infer_fn: tuple[str, ...] = ('__call__',)
  default_config = ModelSignature(batchable=False)
  if llm.__llm_backend__ in {'pt', 'vllm'}:
-    infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
-                 'group_beam_search', 'constrained_beam_search',
-                )
+    infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search',
+                 'constrained_beam_search',
+                 )
  elif llm.__llm_backend__ == 'tf':
-    infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search',
-                 'contrastive_search',
-                )
+    infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search',)
  else:
    infer_fn += ('generate',)
  return {k: default_config for k in infer_fn}
--- a/openllm-python/src/openllm/testing.py
+++ b/openllm-python/src/openllm/testing.py
@@ -27,10 +27,7 @@ def build_bento(model: str,
    bentoml.bentos.delete(bento.tag)

 @contextlib.contextmanager
-def build_container(bento: bentoml.Bento | str | bentoml.Tag,
-                    image_tag: str | None = None,
-                    cleanup: bool = False,
-                    **attrs: t.Any) -> t.Iterator[str]:
+def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any) -> t.Iterator[str]:
  if isinstance(bento, bentoml.Bento): bento_tag = bento.tag
  else: bento_tag = bentoml.Tag.from_taglike(bento)
  if image_tag is None: image_tag = str(bento_tag)
--- a/openllm-python/src/openllm/utils/__init__.py
+++ b/openllm-python/src/openllm/utils/__init__.py
@@ -27,8 +27,7 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
      'serialisation_format': llm._serialisation_format
  }

-def infer_auto_class(
-    backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
+def infer_auto_class(backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
  import openllm
  if backend == 'tf': return openllm.AutoTFLLM
  elif backend == 'flax': return openllm.AutoFlaxLLM
@@ -36,10 +35,7 @@ def infer_auto_class(
  elif backend == 'vllm': return openllm.AutoVLLM
  else: raise RuntimeError(f"Unknown backend: {backend} (supported: 'pt', 'flax', 'tf', 'vllm')")

-__all__ = [
-    'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects',
-    'dummy_vllm_objects'
-]
+__all__ = ['generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects', 'dummy_vllm_objects']

 def __dir__() -> t.Sequence[str]:
  return sorted(__all__)
--- a/openllm-python/tests/_strategies/_configuration.py
+++ b/openllm-python/tests/_strategies/_configuration.py
@@ -16,39 +16,26 @@ env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_na
 def model_settings(draw: st.DrawFn):
  '''Strategy for generating ModelSettings objects.'''
  kwargs: dict[str, t.Any] = {
-      'default_id':
-          st.text(min_size=1),
-      'model_ids':
-          st.lists(st.text(), min_size=1),
-      'architecture':
-          st.text(min_size=1),
-      'url':
-          st.text(),
-      'requires_gpu':
-          st.booleans(),
-      'trust_remote_code':
-          st.booleans(),
-      'requirements':
-          st.none() | st.lists(st.text(), min_size=1),
-      'default_backend':
-          st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
-      'model_type':
-          st.sampled_from(['causal_lm', 'seq2seq_lm']),
-      'name_type':
-          st.sampled_from(['dasherize', 'lowercase']),
-      'timeout':
-          st.integers(min_value=3600),
-      'workers_per_resource':
-          st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
+      'default_id': st.text(min_size=1),
+      'model_ids': st.lists(st.text(), min_size=1),
+      'architecture': st.text(min_size=1),
+      'url': st.text(),
+      'requires_gpu': st.booleans(),
+      'trust_remote_code': st.booleans(),
+      'requirements': st.none() | st.lists(st.text(), min_size=1),
+      'default_backend': st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
+      'model_type': st.sampled_from(['causal_lm', 'seq2seq_lm']),
+      'name_type': st.sampled_from(['dasherize', 'lowercase']),
+      'timeout': st.integers(min_value=3600),
+      'workers_per_resource': st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
  }
  return draw(st.builds(ModelSettings, **kwargs))

-def make_llm_config(
-    cls_name: str,
-    dunder_config: dict[str, t.Any] | ModelSettings,
-    fields: tuple[tuple[t.LiteralString, str, t.Any], ...] | None = None,
-    generation_fields: tuple[tuple[t.LiteralString, t.Any], ...] | None = None,
-) -> type[openllm.LLMConfig]:
+def make_llm_config(cls_name: str,
+                    dunder_config: dict[str, t.Any] | ModelSettings,
+                    fields: tuple[tuple[t.LiteralString, str, t.Any], ...] | None = None,
+                    generation_fields: tuple[tuple[t.LiteralString, t.Any], ...] | None = None,
+                    ) -> type[openllm.LLMConfig]:
  globs: dict[str, t.Any] = {'openllm': openllm}
  _config_args: list[str] = []
  lines: list[str] = [f'class {cls_name}Config(openllm.LLMConfig):']
--- a/openllm-python/tests/configuration_test.py
+++ b/openllm-python/tests/configuration_test.py
@@ -24,21 +24,19 @@ from ._strategies._configuration import make_llm_config
 from ._strategies._configuration import model_settings

 # XXX: @aarnphm fixes TypedDict behaviour in 3.11
-@pytest.mark.skipif(sys.version_info[:2] == (3, 11),
-                    reason='TypedDict in 3.11 behaves differently, so we need to fix this')
+@pytest.mark.skipif(sys.version_info[:2] == (3, 11), reason='TypedDict in 3.11 behaves differently, so we need to fix this')
 def test_missing_default():
  with pytest.raises(ValueError, match='Missing required fields *'):
    make_llm_config('MissingDefaultId', {'name_type': 'lowercase', 'requirements': ['bentoml']})
  with pytest.raises(ValueError, match='Missing required fields *'):
    make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']})
  with pytest.raises(ValueError, match='Missing required fields *'):
-    make_llm_config(
-        'MissingArchitecture', {
-            'default_id': 'huggingface/t5-tiny-testing',
-            'model_ids': ['huggingface/t5-tiny-testing'],
-            'requirements': ['bentoml'],
-        },
-    )
+    make_llm_config('MissingArchitecture', {
+        'default_id': 'huggingface/t5-tiny-testing',
+        'model_ids': ['huggingface/t5-tiny-testing'],
+        'requirements': ['bentoml'],
+    },
+                    )

 def test_forbidden_access():
  cl_ = make_llm_config(
@@ -79,16 +77,11 @@ def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings):
  cl_ = make_llm_config('AttrsProtocolLLM', gen_settings)
  assert attr.has(cl_)

-@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),
-       st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),
-      )
-def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int,
-                             input_temperature: float):
-  cl_ = make_llm_config('ComplexLLM',
-                        gen_settings,
-                        fields=(('field1', 'float', field1),),
-                        generation_fields=(('temperature', temperature),),
-                       )
+@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473),
+       st.floats(min_value=0.0, max_value=1.0),
+       )
+def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float):
+  cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),)
  sent = cl_()
  assert sent.model_dump()['field1'] == field1
  assert sent.model_dump()['generation_config']['temperature'] == temperature
@@ -129,7 +122,6 @@ def test_struct_envvar():
    assert overwrite_default['temperature'] == 0.2

 def test_struct_provided_fields():
-
  class EnvLLM(openllm.LLMConfig):
    __config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
    field1: int = 2
@@ -151,7 +143,7 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat
        'architecture': 'PreTrainedModel'
    },
                           fields=(('field1', 'float', 3.0),),
-                          ).model_construct_env(field1=20.0, temperature=0.4)
+                           ).model_construct_env(field1=20.0, temperature=0.4)
    assert sent.generation_config.temperature == 0.4
    assert sent.field1 == 20.0

--- a/openllm-python/tests/conftest.py
+++ b/openllm-python/tests/conftest.py
@@ -10,35 +10,22 @@ import openllm
 if t.TYPE_CHECKING:
  from openllm_core._typing_compat import LiteralBackend

-_MODELING_MAPPING = {
-    'flan_t5': 'google/flan-t5-small',
-    'opt': 'facebook/opt-125m',
-    'baichuan': 'baichuan-inc/Baichuan-7B',
-}
-_PROMPT_MAPPING = {
-    'qa':
-        'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',
-}
+_MODELING_MAPPING = {'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B',}
+_PROMPT_MAPPING = {'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',}

-def parametrise_local_llm(
-    model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
+def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
  if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
  backends: tuple[LiteralBackend, ...] = tuple()
  if model in openllm.MODEL_MAPPING_NAMES: backends += ('pt',)
  if model in openllm.MODEL_FLAX_MAPPING_NAMES: backends += ('flax',)
  if model in openllm.MODEL_TF_MAPPING_NAMES: backends += ('tf',)
  for backend, prompt in itertools.product(backends, _PROMPT_MAPPING.keys()):
-    yield prompt, openllm.Runner(model,
-                                 model_id=_MODELING_MAPPING[model],
-                                 ensure_available=True,
-                                 backend=backend,
-                                 init_local=True)
+    yield prompt, openllm.Runner(model, model_id=_MODELING_MAPPING[model], ensure_available=True, backend=backend, init_local=True)

 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
  if os.getenv('GITHUB_ACTIONS') is None:
    if 'prompt' in metafunc.fixturenames and 'llm' in metafunc.fixturenames:
-      metafunc.parametrize('prompt,llm',
-                           [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])
+      metafunc.parametrize('prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])

 def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
  # If no tests are collected, pytest exists with code 5, which makes the CI fail.
--- a/openllm-python/tests/models/conftest.py
+++ b/openllm-python/tests/models/conftest.py
@@ -40,13 +40,7 @@ if t.TYPE_CHECKING:
  from openllm.client import BaseAsyncClient

 class ResponseComparator(JSONSnapshotExtension):
-
-  def serialize(self,
-                data: SerializableData,
-                *,
-                exclude: PropertyFilter | None = None,
-                matcher: PropertyMatcher | None = None,
-               ) -> SerializedData:
+  def serialize(self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None,) -> SerializedData:
    if openllm.utils.LazyType(ListAny).isinstance(data):
      data = [d.unmarshaled for d in data]
    else:
@@ -55,7 +49,6 @@ class ResponseComparator(JSONSnapshotExtension):
    return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode()

  def matches(self, *, serialized_data: SerializableData, snapshot_data: SerializableData) -> bool:
-
    def convert_data(data: SerializableData) -> openllm.GenerationOutput | t.Sequence[openllm.GenerationOutput]:
      try:
        data = orjson.loads(data)
@@ -83,8 +76,7 @@ class ResponseComparator(JSONSnapshotExtension):
      return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and
              eq_config(s.marshaled_config, t.marshaled_config))

-    return len(serialized_data) == len(snapshot_data) and all(
-        [eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])
+    return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])

 @pytest.fixture()
 def response_snapshot(snapshot: SnapshotAssertion):
@@ -133,14 +125,8 @@ class LocalHandle(_Handle):
    return self.process.poll() is None

 class HandleProtocol(t.Protocol):
-
   @contextlib.contextmanager
-  def __call__(*,
-               model: str,
-               model_id: str,
-               image_tag: str,
-               quantize: t.AnyStr | None = None,
-              ) -> t.Generator[_Handle, None, None]:
+  def __call__(*, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None,) -> t.Generator[_Handle, None, None]:
    ...

 @attr.define(init=False)
@@ -148,9 +134,7 @@ class DockerHandle(_Handle):
  container_name: str
  docker_client: docker.DockerClient

-  def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int,
-               deployment_mode: t.Literal['container', 'local'],
-              ):
+  def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int, deployment_mode: t.Literal['container', 'local'],):
    self.__attrs_init__(port, deployment_mode, container_name, docker_client)

  def status(self) -> bool:
@@ -165,22 +149,14 @@ def _local_handle(model: str,
                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
                  *,
                  _serve_grpc: bool = False,
-                 ):
+                  ):
  with openllm.utils.reserve_free_port() as port:
    pass

  if not _serve_grpc:
-    proc = openllm.start(model,
-                         model_id=model_id,
-                         quantize=quantize,
-                         additional_args=['--port', str(port)],
-                         __test__=True)
+    proc = openllm.start(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True)
  else:
-    proc = openllm.start_grpc(model,
-                              model_id=model_id,
-                              quantize=quantize,
-                              additional_args=['--port', str(port)],
-                              __test__=True)
+    proc = openllm.start_grpc(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True)

  yield LocalHandle(proc, port, deployment_mode)
  proc.terminate()
@@ -201,7 +177,7 @@ def _container_handle(model: str,
                      quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
                      *,
                      _serve_grpc: bool = False,
-                     ):
+                      ):
  envvar = openllm.utils.EnvVarMixin(model)

  with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port:
@@ -237,7 +213,7 @@ def _container_handle(model: str,
                                        '3000/tcp': port,
                                        '3001/tcp': prom_port
                                    },
-                                   )
+                                    )

  yield DockerHandle(client, container.name, port, deployment_mode)

--- a/openllm-python/tests/models/flan_t5_test.py
+++ b/openllm-python/tests/models/flan_t5_test.py
@@ -16,11 +16,8 @@ model = 'flan_t5'
 model_id = 'google/flan-t5-small'

 @pytest.fixture(scope='module')
-def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'],
-                   clean_context: contextlib.ExitStack,
-                  ):
-  with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode,
-                               clean_context=clean_context) as image_tag:
+def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
+  with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
    with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
      yield handle

--- a/openllm-python/tests/models/opt_test.py
+++ b/openllm-python/tests/models/opt_test.py
@@ -16,11 +16,8 @@ model = 'opt'
 model_id = 'facebook/opt-125m'

 @pytest.fixture(scope='module')
-def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'],
-                    clean_context: contextlib.ExitStack,
-                   ):
-  with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode,
-                               clean_context=clean_context) as image_tag:
+def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
+  with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
    with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
      yield handle

--- a/openllm-python/tests/package_test.py
+++ b/openllm-python/tests/package_test.py
@@ -15,11 +15,10 @@ if t.TYPE_CHECKING:

 HF_INTERNAL_T5_TESTING = 'hf-internal-testing/tiny-random-t5'

-actions_xfail = functools.partial(
-    pytest.mark.xfail,
-    condition=os.getenv('GITHUB_ACTIONS') is not None,
-    reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.',
-)
+actions_xfail = functools.partial(pytest.mark.xfail,
+                                  condition=os.getenv('GITHUB_ACTIONS') is not None,
+                                  reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.',
+                                  )

 @actions_xfail
 def test_general_build_with_internal_testing():
@@ -51,8 +50,7 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
 def dockerfile_template(tmp_path_factory: pytest.TempPathFactory):
  file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template'
  file.write_text(
-      "{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}"
-  )
+      "{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}")
  return file

 @pytest.mark.usefixtures('dockerfile_template')
--- a/openllm-python/tests/strategies_test.py
+++ b/openllm-python/tests/strategies_test.py
@@ -71,11 +71,9 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
    mcls.setenv('CUDA_VISIBLE_DEVICES', '')
    assert len(NvidiaGpuResource.from_system()) >= 0  # TODO: real from_system tests

-    assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],
-                        ).match('Input list should be all string type.')
+    assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],).match('Input list should be all string type.')
    assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match('Input list should be all string type.')
-    assert pytest.raises(ValueError, NvidiaGpuResource.validate,
-                         ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')
+    assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')

 def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
  with monkeypatch.context() as mcls:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -261,51 +261,28 @@ ignore_patterns = [
 based_on_style = "google"
 INDENT_WIDTH = 2
 JOIN_MULTIPLE_LINES = true
-COLUMN_LIMIT = 120
+COLUMN_LIMIT = 152
 USE_TABS = false
 BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1
 BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1
 DISABLE_ENDING_COMMA_HEURISTIC = true
-# ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true
-# ALLOW_MULTILINE_DICTIONARY_KEYS = false
-# ALLOW_MULTILINE_LAMBDAS = false
-# ALLOW_SPLIT_BEFORE_DEFAULT_OR_NAMED_ASSIGNS = false
-# ALLOW_SPLIT_BEFORE_DICT_VALUE = false
-# ARITHMETIC_PRECEDENCE_INDICATION = true
-# BLANK_LINE_BEFORE_CLASS_DOCSTRING = false
-# BLANK_LINE_BEFORE_MODULE_DOCSTRING = false
-# BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
-# DEDENT_CLOSING_BRACKETS = true
-# INDENT_CLOSING_BRACKETS = false
-# COALESCE_BRACKETS = true
-# EACH_DICT_ENTRY_ON_SEPARATE_LINE = true
-# CONTINUATION_ALIGN_STYLE = "SPACE"
-# INDENT_BLANK_LINES = false
-# NO_SPACES_AROUND_SELECTED_BINARY_OPERATORS = true
-# SPACES_AROUND_SUBSCRIPT_COLON = false
-# SPACES_AROUND_DICT_DELIMITERS = false
-# SPACES_AROUND_LIST_DELIMITERS = false
-# SPACES_AROUND_POWER_OPERATOR = false
-# SPACES_AROUND_TUPLE_DELIMITERS = false
-# SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = false
-# SPACE_INSIDE_BRACKETS = false
-# SPLIT_ALL_COMMA_SEPARATED_VALUES = false
-# SPLIT_ALL_TOP_LEVEL_COMMA_SEPARATED_VALUES = true
-# SPLIT_ARGUMENTS_WHEN_COMMA_TERMINATED = false
-# SPLIT_BEFORE_BITWISE_OPERATOR = false
-# SPLIT_BEFORE_CLOSING_BRACKET = false
-# SPLIT_BEFORE_DICT_SET_GENERATOR = false
-# SPLIT_BEFORE_DOT = true
-# SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = false
-# SPLIT_BEFORE_FIRST_ARGUMENT = false
-# SPLIT_BEFORE_LOGICAL_OPERATOR = false
-# SPLIT_BEFORE_NAMED_ASSIGNS = false
-# SPLIT_COMPLEX_COMPREHENSION = true
-# SPLIT_PENALTY_IMPORT_NAMES = 10000
-# SPLIT_PENALTY_AFTER_OPENING_BRACKET = 350
-# SPLIT_PENALTY_BEFORE_IF_EXPR = 10000
-# SPLIT_PENALTY_COMPREHENSION = 2500
-# SPLIT_PENALTY_FOR_ADDED_LINE_SPLIT = 5000
+BLANK_LINE_BEFORE_CLASS_DOCSTRING = false
+BLANK_LINE_BEFORE_MODULE_DOCSTRING = false
+BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
+ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true
+ALLOW_MULTILINE_DICTIONARY_KEYS = false
+ALLOW_SPLIT_BEFORE_DICT_VALUE = false
+COALESCE_BRACKETS = true
+NO_SPACES_AROUND_SELECTED_BINARY_OPERATORS = true
+SPACES_AROUND_SUBSCRIPT_COLON = false
+SPACES_AROUND_DICT_DELIMITERS = false
+SPACES_AROUND_LIST_DELIMITERS = false
+SPACES_AROUND_POWER_OPERATOR = false
+SPACES_AROUND_TUPLE_DELIMITERS = false
+SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = false
+SPACE_INSIDE_BRACKETS = false
+SPLIT_ALL_COMMA_SEPARATED_VALUES = false
+SPLIT_BEFORE_DOT = true

 [tool.pytest.ini_options]
 addopts = ["-rfEX", "-pno:warnings", "--snapshot-warn-unused"]
--- a/tools/dependencies.py
+++ b/tools/dependencies.py
@@ -29,10 +29,8 @@ class Classifier:
  @staticmethod
  def status() -> dict[int, str]:
    return {
-        v: status for v, status in zip(range(1, 8), [
-            '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature',
-            '7 - Inactive'
-        ])
+        v: status for v, status in zip(range(
+            1, 8), ['1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive'])
    }

  @staticmethod
@@ -47,14 +45,10 @@ class Classifier:
    return cls_.joiner.join([cls_.identifier[identifier], *decls])

  @staticmethod
-  def create_python_classifier(implementation: list[str] | None = None,
-                               supported_version: list[str] | None = None) -> list[str]:
+  def create_python_classifier(implementation: list[str] | None = None, supported_version: list[str] | None = None) -> list[str]:
    if supported_version is None: supported_version = ['3.8', '3.9', '3.10', '3.11', '3.12']
    if implementation is None: implementation = ['CPython', 'PyPy']
-    base = [
-        Classifier.create_classifier('language', 'Python'),
-        Classifier.create_classifier('language', 'Python', '3'),
-    ]
+    base = [Classifier.create_classifier('language', 'Python'), Classifier.create_classifier('language', 'Python', '3'),]
    base.append(Classifier.create_classifier('language', 'Python', '3', 'Only'))
    base.extend([Classifier.create_classifier('language', 'Python', version) for version in supported_version])
    base.extend([Classifier.create_classifier('language', 'Python', 'Implementation', impl) for impl in implementation])
@@ -153,8 +147,7 @@ _locals = locals().copy()

 # NOTE: update this table when adding new external dependencies
 # sync with openllm.utils.OPTIONAL_DEPENDENCIES
-_base_requirements.update(
-    {v: _locals.get(f'{inflection.underscore(v).upper()}_DEPS') for v in openllm.utils.OPTIONAL_DEPENDENCIES})
+_base_requirements.update({v: _locals.get(f'{inflection.underscore(v).upper()}_DEPS') for v in openllm.utils.OPTIONAL_DEPENDENCIES})

 _base_requirements = {k: v for k, v in sorted(_base_requirements.items())}

@@ -187,10 +180,7 @@ def create_optional_table() -> Table:
  all_array.append(f"openllm[{','.join(_base_requirements)}]")

  table = tomlkit.table(is_super_table=True)
-  _base_requirements.update({
-      'full': correct_style(all_array.multiline(True)),
-      'all': tomlkit.array('["openllm[full]"]')
-  })
+  _base_requirements.update({'full': correct_style(all_array.multiline(True)), 'all': tomlkit.array('["openllm[full]"]')})
  table.update({k: v for k, v in sorted(_base_requirements.items())})
  table.add(tomlkit.nl())

@@ -228,9 +218,8 @@ def authors() -> Array:
 def keywords() -> Array:
  arr = correct_style(tomlkit.array())
  arr.extend([
-      'MLOps', 'AI', 'BentoML', 'Model Serving', 'Model Deployment', 'LLMOps', 'Falcon', 'Vicuna', 'Llama 2',
-      'Fine tuning', 'Serverless', 'Large Language Model', 'Generative AI', 'StableLM', 'Alpaca', 'PyTorch',
-      'Transformers'
+      'MLOps', 'AI', 'BentoML', 'Model Serving', 'Model Deployment', 'LLMOps', 'Falcon', 'Vicuna', 'Llama 2', 'Fine tuning', 'Serverless',
+      'Large Language Model', 'Generative AI', 'StableLM', 'Alpaca', 'PyTorch', 'Transformers'
  ])
  return arr.multiline(True)

@@ -240,8 +229,7 @@ def build_cli_extensions() -> Table:
  ext.update({
      f'openllm-{inflection.dasherize(ke)}': f'openllm.cli.extension.{ke}:cli' for ke in sorted([
          fname[:-3]
-          for fname in os.listdir(
-              os.path.abspath(os.path.join(ROOT, 'openllm-python', 'src', 'openllm', 'cli', 'extension')))
+          for fname in os.listdir(os.path.abspath(os.path.join(ROOT, 'openllm-python', 'src', 'openllm', 'cli', 'extension')))
          if fname.endswith('.py') and not fname.startswith('__')
      ])
  })
--- a/tools/update-brew-tap.py
+++ b/tools/update-brew-tap.py
@@ -21,8 +21,7 @@ _gz_strategies: dict[t.Literal['macos_arm', 'macos_intel', 'linux_intel'], str]
    'linux_intel': 'x86_64-unknown-linux-musl'
 }

-def determine_release_url(svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel',
-                                                                    'archive']) -> str:
+def determine_release_url(svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel', 'archive']) -> str:
  if target == 'archive': return f'{svn_url}/archive/{tag}.tar.gz'
  return f"{svn_url}/releases/download/{tag}/openllm-{tag.replace('v', '')}-{_gz_strategies[target]}.tar.gz"

@@ -36,11 +35,9 @@ def main() -> int:
  release_tag = api.repos.get_latest_release().name

  shadict: dict[str, t.Any] = {
-      k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip()
-      for k in _gz_strategies
+      k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip() for k in _gz_strategies
  }
-  shadict['archive'] = get_release_hash_command(determine_release_url(_info.svn_url, release_tag, 'archive'),
-                                                release_tag)().strip()
+  shadict['archive'] = get_release_hash_command(determine_release_url(_info.svn_url, release_tag, 'archive'), release_tag)().strip()

  ENVIRONMENT = Environment(extensions=['jinja2.ext.do', 'jinja2.ext.loopcontrols', 'jinja2.ext.debug'],
                            trim_blocks=True,
--- a/tools/update-config-stubs.py
+++ b/tools/update-config-stubs.py
@@ -24,14 +24,12 @@ def process_annotations(annotations: str) -> str:
  else: return annotations

 _value_docstring = {
-    'default_id':
-        '''Return the default model to use when using 'openllm start <model_id>'.
+    'default_id': '''Return the default model to use when using 'openllm start <model_id>'.
        This could be one of the keys in 'self.model_ids' or custom users model.

        This field is required when defining under '__config__'.
        ''',
-    'model_ids':
-        '''A list of supported pretrained models tag for this given runnable.
+    'model_ids': '''A list of supported pretrained models tag for this given runnable.

        For example:
            For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
@@ -39,8 +37,7 @@ _value_docstring = {

        This field is required when defining under '__config__'.
        ''',
-    'architecture':
-        '''The model architecture that is supported by this LLM.
+    'architecture': '''The model architecture that is supported by this LLM.

        Note that any model weights within this architecture generation can always be run and supported by this LLM.

@@ -50,34 +47,21 @@ _value_docstring = {
            ```bash
            openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b
            ```''',
-    'default_backend':
-        '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')''',
-    'url':
-        'The resolved url for this LLMConfig.',
-    'requires_gpu':
-        'Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.',
-    'trust_remote_code':
-        'Whether to always trust remote code',
-    'service_name':
-        "Generated service name for this LLMConfig. By default, it is \"generated_{model_name}_service.py\"",
-    'requirements':
-        'The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.',
-    'model_type':
-        'The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"',
-    'name_type':
-        '''The default name typed for this model. "dasherize" will convert the name to lowercase and
+    'default_backend': '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')''',
+    'url': 'The resolved url for this LLMConfig.',
+    'requires_gpu': 'Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.',
+    'trust_remote_code': 'Whether to always trust remote code',
+    'service_name': "Generated service name for this LLMConfig. By default, it is \"generated_{model_name}_service.py\"",
+    'requirements': 'The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.',
+    'model_type': 'The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"',
+    'name_type': '''The default name typed for this model. "dasherize" will convert the name to lowercase and
        replace spaces with dashes. "lowercase" will convert the name to lowercase. If this is not set, then both
        `model_name` and `start_name` must be specified.''',
-    'model_name':
-        'The normalized version of __openllm_start_name__, determined by __openllm_name_type__',
-    'start_name':
-        'Default name to be used with `openllm start`',
-    'env':
-        'A EnvVarMixin instance for this LLMConfig.',
-    'timeout':
-        'The default timeout to be set for this given LLM.',
-    'workers_per_resource':
-        '''The number of workers per resource. This is used to determine the number of workers to use for this model.
+    'model_name': 'The normalized version of __openllm_start_name__, determined by __openllm_name_type__',
+    'start_name': 'Default name to be used with `openllm start`',
+    'env': 'A EnvVarMixin instance for this LLMConfig.',
+    'timeout': 'The default timeout to be set for this given LLM.',
+    'workers_per_resource': '''The number of workers per resource. This is used to determine the number of workers to use for this model.
        For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
        OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.

@@ -86,10 +70,8 @@ _value_docstring = {

        By default, it is set to 1.
        ''',
-    'fine_tune_strategies':
-        'The fine-tune strategies for this given LLM.',
-    'tokenizer_class':
-        'Optional tokenizer class for this given LLM. See Llama for example.',
+    'fine_tune_strategies': 'The fine-tune strategies for this given LLM.',
+    'tokenizer_class': 'Optional tokenizer class for this given LLM. See Llama for example.',
 }

 _transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'}
@@ -99,16 +81,13 @@ def main() -> int:
    processed = f.readlines()

  start_idx, end_idx = processed.index(' ' * 2 + START_COMMENT), processed.index(' ' * 2 + END_COMMENT)
-  start_stub_idx, end_stub_idx = processed.index(' ' * 4 + START_SPECIAL_COMMENT), processed.index(' ' * 4 +
-                                                                                                   END_SPECIAL_COMMENT)
-  start_attrs_idx, end_attrs_idx = processed.index(' ' * 4 + START_ATTRS_COMMENT), processed.index(' ' * 4 +
-                                                                                                   END_ATTRS_COMMENT)
+  start_stub_idx, end_stub_idx = processed.index(' ' * 4 + START_SPECIAL_COMMENT), processed.index(' ' * 4 + END_SPECIAL_COMMENT)
+  start_attrs_idx, end_attrs_idx = processed.index(' ' * 4 + START_ATTRS_COMMENT), processed.index(' ' * 4 + END_ATTRS_COMMENT)

  # NOTE: inline stubs __config__ attrs representation
  special_attrs_lines: list[str] = []
  for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
-    special_attrs_lines.append(
-        f"{' ' * 4}{keys}: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}\n")
+    special_attrs_lines.append(f"{' ' * 4}{keys}: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}\n")
  # NOTE: inline stubs for _ConfigAttr type stubs
  config_attr_lines: list[str] = []
  for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
@@ -132,40 +111,28 @@ def main() -> int:
  lines.append(' ' * 2 + '# NOTE: generation_class, sampling_class and extras arguments\n')
  lines.extend([
      ' ' * 2 + line for line in [
-          '@overload\n',
-          "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n",
-          '@overload\n',
-          "def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n",
-          '@overload\n', "def __getitem__(self, item: t.Literal['extras']) -> t.Dict[str, t.Any]: ...\n",
+          '@overload\n', "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n", '@overload\n',
+          "def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n", '@overload\n',
+          "def __getitem__(self, item: t.Literal['extras']) -> t.Dict[str, t.Any]: ...\n",
      ]
  ])
  lines.append(' ' * 2 + '# NOTE: GenerationConfig arguments\n')
  generation_config_anns = codegen.get_annotations(GenerationConfig)
  for keys, type_pep563 in generation_config_anns.items():
-    lines.extend([
-        ' ' * 2 + line
-        for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n"]
-    ])
+    lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n"]])
  lines.append(' ' * 2 + '# NOTE: SamplingParams arguments\n')
  for keys, type_pep563 in codegen.get_annotations(SamplingParams).items():
    if keys not in generation_config_anns:
-      lines.extend([
-          ' ' * 2 + line
-          for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n",]
-      ])
+      lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n",]])
  lines.append(' ' * 2 + '# NOTE: PeftType arguments\n')
  for keys in PeftType._member_names_:
-    lines.extend([
-        ' ' * 2 + line for line in
-        ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys.lower()}']) -> dict[str, t.Any]: ...\n",]
-    ])
+    lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys.lower()}']) -> dict[str, t.Any]: ...\n",]])

-  processed = processed[:start_attrs_idx] + [
-      ' ' * 4 + START_ATTRS_COMMENT, *special_attrs_lines, ' ' * 4 + END_ATTRS_COMMENT
-  ] + processed[end_attrs_idx + 1:start_stub_idx] + [
-      ' ' * 4 + START_SPECIAL_COMMENT, *config_attr_lines, ' ' * 4 + END_SPECIAL_COMMENT
-  ] + processed[end_stub_idx + 1:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT
-                                              ] + processed[end_idx + 1:]
+  processed = processed[:start_attrs_idx] + [' ' * 4 + START_ATTRS_COMMENT, *special_attrs_lines, ' ' * 4 + END_ATTRS_COMMENT
+                                             ] + processed[end_attrs_idx + 1:start_stub_idx] + [
+                                                 ' ' * 4 + START_SPECIAL_COMMENT, *config_attr_lines, ' ' * 4 + END_SPECIAL_COMMENT
+                                             ] + processed[end_stub_idx + 1:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT
+                                                                                          ] + processed[end_idx + 1:]
  with _TARGET_FILE.open('w') as f:
    f.writelines(processed)
  return 0
--- a/tools/update-dummy.py
+++ b/tools/update-dummy.py
@@ -14,15 +14,10 @@ from openllm import CONFIG_MAPPING
 if t.TYPE_CHECKING: from collections import OrderedDict

 config_requirements = {
-    k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None
-    for k, v in CONFIG_MAPPING.items()
-}
-_dependencies: dict[LiteralBackend, str] = {
-    k: v for k, v in zip(LiteralBackend.__args__[:-2], ('torch', 'tensorflow', 'flax', 'vllm'))
-}
-_auto: dict[str, str] = {
-    k: v for k, v in zip(LiteralBackend.__args__[:-2], ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))
+    k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items()
 }
+_dependencies: dict[LiteralBackend, str] = {k: v for k, v in zip(LiteralBackend.__args__[:-2], ('torch', 'tensorflow', 'flax', 'vllm'))}
+_auto: dict[str, str] = {k: v for k, v in zip(LiteralBackend.__args__[:-2], ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))}

 def get_target_dummy_file(backend: LiteralBackend) -> Path:
  return _ROOT / 'openllm-python' / 'src' / 'openllm' / 'utils' / f'dummy_{backend}_objects.py'
@@ -36,34 +31,29 @@ def get_mapping(backend: LiteralBackend) -> OrderedDict[t.Any, t.Any]:
 def make_class_stub(model_name: str, backend: LiteralBackend, indentation: int = 2, auto: bool = False) -> list[str]:
  _dep_list: list[str] = [
      f'"{v}"' for v in [
-          _dependencies[backend], *(t.cast(t.List[str], config_requirements[model_name]
-                                          ) if model_name != '__default__' and config_requirements[model_name] else [])
+          _dependencies[backend], *(
+              t.cast(t.List[str], config_requirements[model_name]) if model_name != '__default__' and config_requirements[model_name] else [])
      ]
  ]
  if auto: cl_ = _auto[backend]
  else: cl_ = get_mapping(backend)[model_name]
  lines = [
      f'class {cl_}(metaclass=_DummyMetaclass):', ' ' * indentation + f"_backends=[{','.join(_dep_list)}]",
-      ' ' * indentation +
-      f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])"
+      ' ' * indentation + f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])"
  ]
  return lines

 def write_stub(backend: LiteralBackend, _path: str) -> list[str]:
  base = [
-      f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}',
-      'from __future__ import annotations', 'import typing as _t',
-      'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends',
+      f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', 'from __future__ import annotations',
+      'import typing as _t', 'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends',
  ]
  base.extend([v for it in [make_class_stub(k, backend) for k in get_mapping(backend)] for v in it])
  # autoclass
  base.extend(make_class_stub('__default__', backend, auto=True))
  # mapping and export
  _imports = [f'"{v}"' for v in get_mapping(backend).values()]
-  base += [
-      f'{mapping_names(backend)}:_t.Any=None',
-      f"__all__:list[str]=[\"{mapping_names(backend)}\",\"{_auto[backend]}\",{','.join(_imports)}]\n"
-  ]
+  base += [f'{mapping_names(backend)}:_t.Any=None', f"__all__:list[str]=[\"{mapping_names(backend)}\",\"{_auto[backend]}\",{','.join(_imports)}]\n"]
  return base

 def main() -> int:
--- a/tools/update-models-import.py
+++ b/tools/update-models-import.py
@@ -6,27 +6,23 @@ from pathlib import Path
 _TARGET_FILE = Path(__file__).parent.parent / 'openllm-python' / 'src' / 'openllm' / 'models' / '__init__.py'

 def create_module_import() -> str:
-  r = [
-      f'"{p.name}"' for p in _TARGET_FILE.parent.glob('*/')
-      if p.name not in ['__pycache__', '__init__.py', '.DS_Store']
-  ]
+  r = [f'"{p.name}"' for p in _TARGET_FILE.parent.glob('*/') if p.name not in ['__pycache__', '__init__.py', '.DS_Store']]
  return f"_MODELS:set[str]={{{', '.join(sorted(r))}}}"

 def create_stubs_import() -> list[str]:
  return [
-      'if t.TYPE_CHECKING:from . import ' + ','.join([
-          f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/'))
-          if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}
-      ]), '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})', '__all__=__lazy.__all__',
-      '__dir__=__lazy.__dir__', '__getattr__=__lazy.__getattr__\n'
+      'if t.TYPE_CHECKING:from . import ' +
+      ','.join([f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}]),
+      '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})', '__all__=__lazy.__all__', '__dir__=__lazy.__dir__',
+      '__getattr__=__lazy.__getattr__\n'
  ]

 def main() -> int:
  _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__))
  with _TARGET_FILE.open('w') as f:
    f.writelines('\n'.join([
-        f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}',
-        'from __future__ import annotations', 'import typing as t', 'from openllm_core.utils import LazyModule',
+        f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', 'from __future__ import annotations',
+        'import typing as t', 'from openllm_core.utils import LazyModule',
        create_module_import(), *create_stubs_import(),
    ]))
  return 0
--- a/tools/update-readme.py
+++ b/tools/update-readme.py
@@ -42,8 +42,7 @@ def main() -> int:
  meta.extend([f'<th>{header}</th>\n' for header in formatted.keys() if header not in ('URL',)])
  meta += ['</tr>\n']
  # NOTE: rows
-  for name, architecture, url, model_ids, installation in t.cast(t.Iterable[t.Tuple[str, str, str, t.List[str], str]],
-                                                                 zip(*formatted.values())):
+  for name, architecture, url, model_ids, installation in t.cast(t.Iterable[t.Tuple[str, str, str, t.List[str], str]], zip(*formatted.values())):
    meta += '<tr>\n'
    # configure architecture URL
    cfg_cls = openllm.CONFIG_MAPPING[name]
--- a/tools/write-coverage-report.py
+++ b/tools/write-coverage-report.py
@@ -31,8 +31,7 @@ def main() -> int:
  color = 'ok' if float(total_rate) >= 95 else 'critical'
  lines.insert(0, f'![Code Coverage](https://img.shields.io/badge/coverage-{total_rate}%25-{color}?style=flat)\n')

-  lines.append(
-      f'**Summary** | {100 if total_rate == 100 else total_rate}% ({total_statements_covered} / {total_statements})\n')
+  lines.append(f'**Summary** | {100 if total_rate == 100 else total_rate}% ({total_statements_covered} / {total_statements})\n')

  coverage_report = ROOT / 'coverage-report.md'
  with coverage_report.open('w', encoding='utf-8') as f: