style: google

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2026-06-11 18:09:52 -04:00 · 2023-08-30 13:52:00 -04:00
parent e2ba6a92a6
commit b545ad2ad1
98 changed files with 3514 additions and 2094 deletions
--- a/openllm-python/src/openllm/__init__.py
+++ b/openllm-python/src/openllm/__init__.py
@@ -26,11 +26,14 @@ else:
  # configuration for bitsandbytes before import
  _os.environ["BITSANDBYTES_NOWELCOME"] = _os.environ.get("BITSANDBYTES_NOWELCOME", "1")
  # NOTE: The following warnings from bitsandbytes, and probably not that important for users to see when DEBUG is False
-  _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
-  _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
+  _warnings.filterwarnings(
+      "ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization")
+  _warnings.filterwarnings(
+      "ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization")
  _warnings.filterwarnings("ignore", message="The installed version of bitsandbytes was compiled without GPU support.")
  # NOTE: ignore the following warning from ghapi as it is not important for users
-  _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
+  _warnings.filterwarnings("ignore",
+                           message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")

 _import_structure: dict[str, list[str]] = {
    "exceptions": [],
@@ -45,8 +48,13 @@ _import_structure: dict[str, list[str]] = {
    "_quantisation": ["infer_quantisation_config"],
    "_embeddings": ["GenericEmbeddingRunnable"],
    "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"],
-    "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
-    "models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"],
+    "_generation": [
+        "StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList",
+        "prepare_logits_processor"
+    ],
+    "models.auto": [
+        "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"
+    ],
    "models.chatglm": [],
    "models.baichuan": [],
    "models.dolly_v2": [],
@@ -73,7 +81,8 @@ if _t.TYPE_CHECKING:
  from .utils import infer_auto_class as infer_auto_class

 try:
-  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
+  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_cpm_kernels_available()):
+    raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_pt_objects"] = ["ChatGLM", "Baichuan"]
 else:
@@ -83,7 +92,8 @@ else:
    from .models.baichuan import Baichuan as Baichuan
    from .models.chatglm import ChatGLM as ChatGLM
 try:
-  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_triton_available()): raise exceptions.MissingDependencyError
+  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_triton_available()):
+    raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["MPT"])
  else: _import_structure["utils.dummy_pt_objects"] = ["MPT"]
@@ -91,7 +101,8 @@ else:
  _import_structure["models.mpt"].extend(["MPT"])
  if _t.TYPE_CHECKING: from .models.mpt import MPT as MPT
 try:
-  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_einops_available()): raise exceptions.MissingDependencyError
+  if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_einops_available()):
+    raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["Falcon"])
  else: _import_structure["utils.dummy_pt_objects"] = ["Falcon"]
@@ -103,7 +114,8 @@ try:
  if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
  _import_structure["utils.dummy_pt_objects"] = [
-      name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")
+      name for name in dir(utils.dummy_pt_objects)
+      if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")
  ]
 else:
  _import_structure["models.flan_t5"].extend(["FlanT5"])
@@ -126,7 +138,9 @@ else:
 try:
  if not openllm_core.utils.is_vllm_available(): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
-  _import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)]
+  _import_structure["utils.dummy_vllm_objects"] = [
+      name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)
+  ]
 else:
  _import_structure["models.baichuan"].extend(["VLLMBaichuan"])
  _import_structure["models.llama"].extend(["VLLMLlama"])
@@ -152,7 +166,9 @@ else:
 try:
  if not openllm_core.utils.is_flax_available(): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
-  _import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)]
+  _import_structure["utils.dummy_flax_objects"] = [
+      name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)
+  ]
 else:
  _import_structure["models.flan_t5"].extend(["FlaxFlanT5"])
  _import_structure["models.opt"].extend(["FlaxOPT"])
@@ -164,7 +180,9 @@ else:
 try:
  if not openllm_core.utils.is_tf_available(): raise exceptions.MissingDependencyError
 except exceptions.MissingDependencyError:
-  _import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)]
+  _import_structure["utils.dummy_tf_objects"] = [
+      name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)
+  ]
 else:
  _import_structure["models.flan_t5"].extend(["TFFlanT5"])
  _import_structure["models.opt"].extend(["TFOPT"])
@@ -175,7 +193,10 @@ else:
    from .models.opt import TFOPT as TFOPT

 # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
-__lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED})
+__lazy = openllm_core.utils.LazyModule(__name__,
+                                       globals()["__file__"],
+                                       _import_structure,
+                                       extra_objects={"COMPILED": COMPILED})
 __all__ = __lazy.__all__
 __dir__ = __lazy.__dir__
 __getattr__ = __lazy.__getattr__
--- a/openllm-python/src/openllm/_embeddings.py
+++ b/openllm-python/src/openllm/_embeddings.py
@@ -26,22 +26,24 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model:
  except bentoml.exceptions.NotFound:
    model_signatures = {
        k: ModelSignature(batchable=False)
-        for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__')
+        for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
+                  'group_beam_search', 'constrained_beam_search', '__call__')
    }
-    with bentoml.models.create(
-        ids,
-        module=MODULE_NAME,
-        api_version=API_VERSION,
-        options=ModelOptions(),
-        context=openllm.utils.generate_context(framework_name='transformers'),
-        labels={
-            'runtime': 'pt', 'framework': 'openllm'
-        },
-        signatures=model_signatures
-    ) as bentomodel:
+    with bentoml.models.create(ids,
+                               module=MODULE_NAME,
+                               api_version=API_VERSION,
+                               options=ModelOptions(),
+                               context=openllm.utils.generate_context(framework_name='transformers'),
+                               labels={
+                                   'runtime': 'pt',
+                                   'framework': 'openllm'
+                               },
+                               signatures=model_signatures) as bentomodel:
      snapshot_download(
-          _GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt']
-      )
+          _GENERIC_EMBEDDING_ID,
+          local_dir=bentomodel.path,
+          local_dir_use_symlinks=False,
+          ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt'])
      return bentomodel

 class GenericEmbeddingRunnable(bentoml.Runnable):
@@ -66,7 +68,10 @@ class GenericEmbeddingRunnable(bentoml.Runnable):
      model_output = self.model(**encoded_input)
    # Perform pooling and normalize
    sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1)
-    return [openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))]
+    return [
+        openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(),
+                              num_tokens=int(torch.sum(attention_mask).item()))
+    ]

  @staticmethod
  def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
--- a/openllm-python/src/openllm/_generation.py
+++ b/openllm-python/src/openllm/_generation.py
@@ -14,23 +14,30 @@ LogitsProcessorList = transformers.LogitsProcessorList
 StoppingCriteriaList = transformers.StoppingCriteriaList

 class StopSequenceCriteria(transformers.StoppingCriteria):
-  def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
+
+  def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer |
+               transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
    if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
    self.stop_sequences, self.tokenizer = stop_sequences, tokenizer

  def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool:
-    return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)
+    return any(
+        self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences)

 class StopOnTokens(transformers.StoppingCriteria):
+
  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool:
    return input_ids[0][-1] in {50278, 50279, 50277, 1, 0}

 def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList:
  generation_config = config.generation_config
  logits_processor = transformers.LogitsProcessorList()
-  if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0: logits_processor.append(transformers.TemperatureLogitsWarper(generation_config['temperature']))
-  if generation_config['repetition_penalty'] > 1.0: logits_processor.append(transformers.RepetitionPenaltyLogitsProcessor(generation_config['repetition_penalty']))
-  if 1e-8 <= generation_config['top_p']: logits_processor.append(transformers.TopPLogitsWarper(generation_config['top_p']))
+  if generation_config['temperature'] >= 1e-5 and generation_config['temperature'] != 1.0:
+    logits_processor.append(transformers.TemperatureLogitsWarper(generation_config['temperature']))
+  if generation_config['repetition_penalty'] > 1.0:
+    logits_processor.append(transformers.RepetitionPenaltyLogitsProcessor(generation_config['repetition_penalty']))
+  if 1e-8 <= generation_config['top_p']:
+    logits_processor.append(transformers.TopPLogitsWarper(generation_config['top_p']))
  if generation_config['top_k'] > 0: logits_processor.append(transformers.TopKLogitsWarper(generation_config['top_k']))
  return logits_processor

--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
--- a/openllm-python/src/openllm/_quantisation.py
+++ b/openllm-python/src/openllm/_quantisation.py
@@ -15,21 +15,27 @@ if t.TYPE_CHECKING:

  from ._llm import LLM

-autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')
+autogptq, torch, transformers = LazyLoader('autogptq', globals(),
+                                           'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader(
+                                               'transformers', globals(), 'transformers')

 logger = logging.getLogger(__name__)

 QuantiseMode = t.Literal['int8', 'int4', 'gptq']

@overload
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'],
+                              **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
  ...

@overload
-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
+def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'],
+                              **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
  ...

-def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
+def infer_quantisation_config(
+    cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode,
+    **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
  # 8 bit configuration
  int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
  int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -50,13 +56,12 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
    if 'lm_head' not in int8_skip_modules and cls.config_class.__openllm_model_type__ == 'causal_lm':
      logger.debug("Skipping 'lm_head' for quantization for %s", cls.__name__)
      int8_skip_modules.append('lm_head')
-    return transformers.BitsAndBytesConfig(
-        load_in_8bit=True,
-        llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload,
-        llm_int8_threshhold=int8_threshold,
-        llm_int8_skip_modules=int8_skip_modules,
-        llm_int8_has_fp16_weight=int8_has_fp16_weight,
-    )
+    return transformers.BitsAndBytesConfig(load_in_8bit=True,
+                                           llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload,
+                                           llm_int8_threshhold=int8_threshold,
+                                           llm_int8_skip_modules=int8_skip_modules,
+                                           llm_int8_has_fp16_weight=int8_has_fp16_weight,
+                                          )

  # 4 bit configuration
  int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16)
@@ -66,18 +71,21 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
  # NOTE: Quantization setup
  # quantize is a openllm.LLM feature, where we can quantize the model
  # with bitsandbytes or quantization aware training.
-  if not is_bitsandbytes_available(): raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
+  if not is_bitsandbytes_available():
+    raise RuntimeError(
+        "Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'"
+    )
  if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules)
  elif quantise == 'int4':
    if is_transformers_supports_kbit():
-      quantisation_config = transformers.BitsAndBytesConfig(
-          load_in_4bit=True, bnb_4bit_compute_dtype=int4_compute_dtype, bnb_4bit_quant_type=int4_quant_type, bnb_4bit_use_double_quant=int4_use_double_quant
-      )
+      quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
+                                                            bnb_4bit_compute_dtype=int4_compute_dtype,
+                                                            bnb_4bit_quant_type=int4_quant_type,
+                                                            bnb_4bit_use_double_quant=int4_use_double_quant)
    else:
      logger.warning(
          "'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.",
-          pkg.pkg_version_info('transformers')
-      )
+          pkg.pkg_version_info('transformers'))
      quantisation_config = create_int8_config(int8_skip_modules)
  elif quantise == 'gptq':
    if not is_autogptq_available():
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -21,11 +21,14 @@ if t.TYPE_CHECKING:
  from bentoml._internal.runner.runner import AbstractRunner
  from bentoml._internal.runner.runner import RunnerMethod
  from openllm_core._typing_compat import TypeAlias
-  _EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]], t.Sequence[openllm.EmbeddingsOutput]]
+  _EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]],
+                                             [t.List[str]], t.Sequence[openllm.EmbeddingsOutput]]

 # The following warnings from bitsandbytes, and probably not that important for users to see
-warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
-warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
+warnings.filterwarnings('ignore',
+                        message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
+warnings.filterwarnings('ignore',
+                        message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
 warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')

 model = os.environ.get('OPENLLM_MODEL', '{__model_name__}')  # openllm: model name
@@ -37,15 +40,23 @@ generic_embedding_runner = bentoml.Runner(
    name='llm-generic-embedding',
    scheduling_strategy=openllm_core.CascadingResourceStrategy,
    max_batch_size=32,
-    max_latency_ms=300
-)
+    max_latency_ms=300)
 runners: list[AbstractRunner] = [runner]
 if not runner.supports_embeddings: runners.append(generic_embedding_runner)
 svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)

-_JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None})
+_JsonInput = bentoml.io.JSON.from_sample({
+    'prompt': '',
+    'llm_config': llm_config.model_dump(flatten=True),
+    'adapter_name': None
+})

-@svc.api(route='/v1/generate', input=_JsonInput, output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)}))
+@svc.api(route='/v1/generate',
+         input=_JsonInput,
+         output=bentoml.io.JSON.from_sample({
+             'responses': [],
+             'configuration': llm_config.model_dump(flatten=True)
+         }))
 async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
  qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
  config = qa_inputs.llm_config.model_dump()
@@ -56,67 +67,45 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
 async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  echo = input_dict.pop('echo', False)
  qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
-  return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump())
+  return runner.generate_iterator.async_stream(qa_inputs.prompt,
+                                               adapter_name=qa_inputs.adapter_name,
+                                               echo=echo,
+                                               **qa_inputs.llm_config.model_dump())

-@svc.api(
-    route='/v1/metadata',
-    input=bentoml.io.Text(),
-    output=bentoml.io.JSON.from_sample({
-        'model_id': runner.llm.model_id,
-        'timeout': 3600,
-        'model_name': llm_config['model_name'],
-        'framework': runner.llm_framework,
-        'configuration': '',
-        'supports_embeddings': runner.supports_embeddings,
-        'supports_hf_agent': runner.supports_hf_agent
-    })
-)
+@svc.api(route='/v1/metadata',
+         input=bentoml.io.Text(),
+         output=bentoml.io.JSON.from_sample({
+             'model_id': runner.llm.model_id,
+             'timeout': 3600,
+             'model_name': llm_config['model_name'],
+             'framework': runner.llm_framework,
+             'configuration': '',
+             'supports_embeddings': runner.supports_embeddings,
+             'supports_hf_agent': runner.supports_hf_agent
+         }))
 def metadata_v1(_: str) -> openllm.MetadataOutput:
-  return openllm.MetadataOutput(
-      timeout=llm_config['timeout'],
-      model_name=llm_config['model_name'],
-      framework=llm_config['env']['framework_value'],
-      model_id=runner.llm.model_id,
-      configuration=llm_config.model_dump_json().decode(),
-      supports_embeddings=runner.supports_embeddings,
-      supports_hf_agent=runner.supports_hf_agent
-  )
+  return openllm.MetadataOutput(timeout=llm_config['timeout'],
+                                model_name=llm_config['model_name'],
+                                framework=llm_config['env']['framework_value'],
+                                model_id=runner.llm.model_id,
+                                configuration=llm_config.model_dump_json().decode(),
+                                supports_embeddings=runner.supports_embeddings,
+                                supports_hf_agent=runner.supports_hf_agent)

-@svc.api(
-    route='/v1/embeddings',
-    input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
-    output=bentoml.io.JSON.from_sample({
-        'embeddings': [
-            0.007917795330286026,
-            -0.014421648345887661,
-            0.00481307040899992,
-            0.007331526838243008,
-            -0.0066398633643984795,
-            0.00945580005645752,
-            0.0087016262114048,
-            -0.010709521360695362,
-            0.012635177001357079,
-            0.010541186667978764,
-            -0.00730888033285737,
-            -0.001783102168701589,
-            0.02339819073677063,
-            -0.010825827717781067,
-            -0.015888236463069916,
-            0.01876218430697918,
-            0.0076906150206923485,
-            0.0009032754460349679,
-            -0.010024012066423893,
-            0.01090280432254076,
-            -0.008668390102684498,
-            0.02070549875497818,
-            0.0014594447566196322,
-            -0.018775740638375282,
-            -0.014814382418990135,
-            0.01796768605709076
-        ],
-        'num_tokens': 20
-    })
-)
+@svc.api(route='/v1/embeddings',
+         input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']),
+         output=bentoml.io.JSON.from_sample({
+             'embeddings': [
+                 0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008,
+                 -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362,
+                 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589,
+                 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918,
+                 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076,
+                 -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282,
+                 -0.014814382418990135, 0.01796768605709076
+             ],
+             'num_tokens': 20
+         }))
 async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
  embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode  # type: ignore[type-arg,assignment,valid-type]
  responses = (await embed_call.async_run(phrases))[0]
@@ -132,7 +121,8 @@ if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
      raise openllm.exceptions.OpenLLMException(f'Invalid JSON input received: {err}') from None
    stop = input_data.parameters.pop('stop', ['\n'])
    try:
-      return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters), status_code=200)
+      return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters),
+                          status_code=200)
    except NotImplementedError:
      return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500)

@@ -142,7 +132,8 @@ if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
 # general metadata app
 async def list_adapter_v1(_: Request) -> Response:
  res: dict[str, t.Any] = {}
-  if runner.peft_adapters['success'] is True: res['result'] = {k: v.to_dict() for k, v in runner.peft_adapters['result'].items()}
+  if runner.peft_adapters['success'] is True:
+    res['result'] = {k: v.to_dict() for k, v in runner.peft_adapters['result'].items()}
  res.update({'success': runner.peft_adapters['success'], 'error_msg': runner.peft_adapters['error_msg']})
  return JSONResponse(res, status_code=200)

--- a/openllm-python/src/openllm/bundle/__init__.py
+++ b/openllm-python/src/openllm/bundle/__init__.py
@@ -10,7 +10,10 @@ from openllm_core.utils import LazyModule

 _import_structure: dict[str, list[str]] = {
    '_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'],
-    'oci': ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver']
+    'oci': [
+        'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name',
+        'supported_registries', 'RefResolver'
+    ]
 }

 if t.TYPE_CHECKING:
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -43,14 +43,18 @@ logger = logging.getLogger(__name__)

 OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'

-def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
+def build_editable(path: str,
+                   package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None:
  '''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.'''
  if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != 'true': return None
  # We need to build the package in editable mode, so that we can import it
  from build import ProjectBuilder
  from build.env import IsolatedEnvBuilder
  module_location = openllm_core.utils.pkg.source_locations(package)
-  if not module_location: raise RuntimeError('Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.')
+  if not module_location:
+    raise RuntimeError(
+        'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.'
+    )
  pyproject_path = Path(module_location).parent.parent / 'pyproject.toml'
  if os.path.isfile(pyproject_path.__fspath__()):
    logger.info('Generating built wheels for package %s...', package)
@@ -60,9 +64,14 @@ def build_editable(path: str, package: t.Literal['openllm', 'openllm_core', 'ope
      builder.scripts_dir = env.scripts_dir
      env.install(builder.build_system_requires)
      return builder.build('wheel', path, config_settings={'--global-option': '--quiet'})
-  raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')
+  raise RuntimeError(
+      'Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.')

-def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] | None = None, adapter_map: dict[str, str | None] | None = None,) -> PythonOptions:
+def construct_python_options(llm: openllm.LLM[t.Any, t.Any],
+                             llm_fs: FS,
+                             extra_dependencies: tuple[str, ...] | None = None,
+                             adapter_map: dict[str, str | None] | None = None,
+                            ) -> PythonOptions:
  packages = ['openllm', 'scipy']  # apparently bnb misses this one
  if adapter_map is not None: packages += ['openllm[fine-tune]']
  # NOTE: add openllm to the default dependencies
@@ -73,27 +82,24 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d

  req = llm.config['requirements']
  if req is not None: packages.extend(req)
-  if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false': packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
+  if str(os.environ.get('BENTOML_BUNDLE_LOCAL_BUILD', False)).lower() == 'false':
+    packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")

  env = llm.config['env']
  framework_envvar = env['framework_value']
  if framework_envvar == 'flax':
-    if not openllm_core.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
-    packages.extend([importlib.metadata.version('flax'), importlib.metadata.version('jax'), importlib.metadata.version('jaxlib')])
+    if not openllm_core.utils.is_flax_available():
+      raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
+    packages.extend(
+        [importlib.metadata.version('flax'),
+         importlib.metadata.version('jax'),
+         importlib.metadata.version('jaxlib')])
  elif framework_envvar == 'tf':
-    if not openllm_core.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
-    candidates = (
-        'tensorflow',
-        'tensorflow-cpu',
-        'tensorflow-gpu',
-        'tf-nightly',
-        'tf-nightly-cpu',
-        'tf-nightly-gpu',
-        'intel-tensorflow',
-        'intel-tensorflow-avx512',
-        'tensorflow-rocm',
-        'tensorflow-macos',
-    )
+    if not openllm_core.utils.is_tf_available():
+      raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
+    candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu',
+                  'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos',
+                 )
    # For the metadata, we have to look for both tensorflow and tensorflow-cpu
    for candidate in candidates:
      try:
@@ -106,28 +112,28 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
      except importlib.metadata.PackageNotFoundError:
        pass  # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
  else:
-    if not openllm_core.utils.is_torch_available(): raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
+    if not openllm_core.utils.is_torch_available():
+      raise ValueError('PyTorch is not available. Make sure to have it locally installed.')
    packages.extend([f'torch>={importlib.metadata.version("torch")}'])
  wheels: list[str] = []
  built_wheels: list[str | None] = [
-      build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p)) for p in ('openllm_core', 'openllm_client', 'openllm')
+      build_editable(llm_fs.getsyspath('/'), t.cast(t.Literal['openllm', 'openllm_core', 'openllm_client'], p))
+      for p in ('openllm_core', 'openllm_client', 'openllm')
  ]
-  if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
-  return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=['https://download.pytorch.org/whl/cu118'])
+  if all(i for i in built_wheels):
+    wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
+  return PythonOptions(packages=packages,
+                       wheels=wheels,
+                       lock_packages=False,
+                       extra_index_url=['https://download.pytorch.org/whl/cu118'])

-def construct_docker_options(
-    llm: openllm.LLM[t.Any, t.Any],
-    _: FS,
-    workers_per_resource: float,
-    quantize: LiteralString | None,
-    bettertransformer: bool | None,
-    adapter_map: dict[str, str | None] | None,
-    dockerfile_template: str | None,
-    runtime: t.Literal['ggml', 'transformers'],
-    serialisation_format: t.Literal['safetensors', 'legacy'],
-    container_registry: LiteralContainerRegistry,
-    container_version_strategy: LiteralContainerVersionStrategy
-) -> DockerOptions:
+def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float,
+                             quantize: LiteralString | None, bettertransformer: bool | None,
+                             adapter_map: dict[str, str | None] | None, dockerfile_template: str | None,
+                             runtime: t.Literal['ggml', 'transformers'], serialisation_format: t.Literal['safetensors',
+                                                                                                         'legacy'],
+                             container_registry: LiteralContainerRegistry,
+                             container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
  from openllm.cli._factory import parse_config_options
  environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
  env: openllm_core.utils.EnvVarMixin = llm.config['env']
@@ -146,12 +152,18 @@ def construct_docker_options(
  if adapter_map: env_dict['BITSANDBYTES_NOWELCOME'] = os.environ.get('BITSANDBYTES_NOWELCOME', '1')

  # We need to handle None separately here, as env from subprocess doesn't accept None value.
-  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
+  _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'],
+                                        bettertransformer=bettertransformer,
+                                        quantize=quantize,
+                                        runtime=runtime)

  env_dict[_env.bettertransformer] = str(_env['bettertransformer_value'])
  if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value'])
  env_dict[_env.runtime] = _env['runtime_value']
-  return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', env=env_dict, dockerfile_template=dockerfile_template)
+  return DockerOptions(
+      base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}',
+      env=env_dict,
+      dockerfile_template=dockerfile_template)

 OPENLLM_MODEL_NAME = '# openllm: model name'
 OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map'
@@ -185,47 +197,58 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
 def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
  from openllm_core.utils import DEBUG
  model_name = llm.config['model_name']
-  logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/'))
+  logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'],
+               llm_fs.getsyspath('/'))
  with open(_service_file.__fspath__(), 'r') as f:
    src_contents = f.readlines()
  for it in src_contents:
-    if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
+    if OPENLLM_MODEL_NAME in it:
+      src_contents[src_contents.index(it)] = (
+          ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n')
    elif OPENLLM_MODEL_ADAPTER_MAP in it:
-      src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
+      src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(
+          orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n')
  script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + ''.join(src_contents)
  if DEBUG: logger.info('Generated script:\n%s', script)
  llm_fs.writetext(llm.config['service_name'], script)

@inject
-def create_bento(
-    bento_tag: bentoml.Tag,
-    llm_fs: FS,
-    llm: openllm.LLM[t.Any, t.Any],
-    workers_per_resource: str | float,
-    quantize: LiteralString | None,
-    bettertransformer: bool | None,
-    dockerfile_template: str | None,
-    adapter_map: dict[str, str | None] | None = None,
-    extra_dependencies: tuple[str, ...] | None = None,
-    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
-    serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
-    container_registry: LiteralContainerRegistry = 'ecr',
-    container_version_strategy: LiteralContainerVersionStrategy = 'release',
-    _bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
-    _model_store: ModelStore = Provide[BentoMLContainer.model_store]
-) -> bentoml.Bento:
+def create_bento(bento_tag: bentoml.Tag,
+                 llm_fs: FS,
+                 llm: openllm.LLM[t.Any, t.Any],
+                 workers_per_resource: str | float,
+                 quantize: LiteralString | None,
+                 bettertransformer: bool | None,
+                 dockerfile_template: str | None,
+                 adapter_map: dict[str, str | None] | None = None,
+                 extra_dependencies: tuple[str, ...] | None = None,
+                 runtime: t.Literal['ggml', 'transformers'] = 'transformers',
+                 serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
+                 container_registry: LiteralContainerRegistry = 'ecr',
+                 container_version_strategy: LiteralContainerVersionStrategy = 'release',
+                 _bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
+                 _model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento:
  framework_envvar = llm.config['env']['framework_value']
  labels = dict(llm.identifying_params)
-  labels.update({'_type': llm.llm_type, '_framework': framework_envvar, 'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle'})
+  labels.update({
+      '_type': llm.llm_type,
+      '_framework': framework_envvar,
+      'start_name': llm.config['start_name'],
+      'base_name_or_path': llm.model_id,
+      'bundler': 'openllm.bundle'
+  })
  if adapter_map: labels.update(adapter_map)
  if isinstance(workers_per_resource, str):
    if workers_per_resource == 'round_robin': workers_per_resource = 1.0
-    elif workers_per_resource == 'conserved': workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count())
+    elif workers_per_resource == 'conserved':
+      workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 /
+                                                                                      openllm_core.utils.device_count())
    else:
      try:
        workers_per_resource = float(workers_per_resource)
      except ValueError:
-        raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
+        raise ValueError(
+            "'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
  elif isinstance(workers_per_resource, int):
    workers_per_resource = float(workers_per_resource)
  logger.info("Building Bento for '%s'", llm.config['start_name'])
@@ -233,19 +256,18 @@ def create_bento(
  write_service(llm, adapter_map, llm_fs)

  llm_spec = ModelSpec.from_item({'tag': str(llm.tag), 'alias': llm.tag.name})
-  build_config = BentoBuildConfig(
-      service=f"{llm.config['service_name']}:svc",
-      name=bento_tag.name,
-      labels=labels,
-      description=f"OpenLLM service for {llm.config['start_name']}",
-      include=list(llm_fs.walk.files()),
-      exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
-      python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
-      models=[llm_spec],
-      docker=construct_docker_options(
-          llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy
-      )
-  )
+  build_config = BentoBuildConfig(service=f"{llm.config['service_name']}:svc",
+                                  name=bento_tag.name,
+                                  labels=labels,
+                                  description=f"OpenLLM service for {llm.config['start_name']}",
+                                  include=list(llm_fs.walk.files()),
+                                  exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
+                                  python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
+                                  models=[llm_spec],
+                                  docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize,
+                                                                  bettertransformer, adapter_map, dockerfile_template,
+                                                                  runtime, serialisation_format, container_registry,
+                                                                  container_version_strategy))

  bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/'))
  # NOTE: the model_id_path here is only used for setting this environment variable within the container built for BentoLLM.
@@ -261,6 +283,7 @@ def create_bento(
  if openllm_core.utils.DEBUG: logger.info('Generated script:\n%s', script)

  bento._fs.writetext(service_fs_path, script)
-  if 'model_store' in inspect.signature(bento.save).parameters: return bento.save(bento_store=_bento_store, model_store=_model_store)
+  if 'model_store' in inspect.signature(bento.save).parameters:
+    return bento.save(bento_store=_bento_store, model_store=_model_store)
  # backward compatibility: `model_store` was only recently added to `bento.save`, so fall back when it is absent
  return bento.save(bento_store=_bento_store)
--- a/openllm-python/src/openllm/bundle/oci/init.py
+++ b/openllm-python/src/openllm/bundle/oci/init.py
@@ -42,7 +42,11 @@ ROOT_DIR = pathlib.Path(os.path.abspath('__file__')).parent.parent.parent
 # but in the future, we can infer it from the git repo and so on, to give users more options
 # to build the base image. For now, all of the base images will be <registry>/bentoml/openllm:...
 # NOTE: The ECR registry is the public one and currently only the @bentoml team has access to push to it.
-_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {'docker': 'docker.io/bentoml/openllm', 'gh': 'ghcr.io/bentoml/openllm', 'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm'}
+_CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {
+    'docker': 'docker.io/bentoml/openllm',
+    'gh': 'ghcr.io/bentoml/openllm',
+    'ecr': 'public.ecr.aws/y5w8i4y6/bentoml/openllm'
+}

 # TODO: support custom forks. Currently it only supports openllm main.
 _OWNER = 'bentoml'
@@ -64,7 +68,8 @@ def _commit_time_range(r: int = 5) -> str:
 class VersionNotSupported(openllm.exceptions.OpenLLMException):
  """Raised when the stable release is so old that it doesn't include the OpenLLM base container."""

-_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy'])
+_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple',
+                                                                             ['git_hash', 'version', 'strategy'])

 def nightly_resolver(cls: type[RefResolver]) -> str:
  # NOTE: all openllm containers will have sha-<git_hash[:7]>
@@ -78,7 +83,11 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
    commits = t.cast('list[dict[str, t.Any]]', cls._ghapi.repos.list_commits(since=_commit_time_range()))
    return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message'])
  # now is the correct behaviour
-  return orjson.loads(subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', 'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2]
+  return orjson.loads(
+      subprocess.check_output([
+          docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags',
+          'docker://ghcr.io/bentoml/openllm'
+      ]).decode().strip())['Tags'][-2]

@attr.attrs(eq=False, order=False, slots=True, frozen=True)
 class RefResolver:
@@ -98,16 +107,20 @@ class RefResolver:
      # NOTE: This strategy will only support openllm>=0.2.12
      meta: dict[str, t.Any] = cls._ghapi.repos.get_latest_release()
      version_str = meta['name'].lstrip('v')
-      version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
+      version: tuple[str,
+                     str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str)
    else:
      version = ('', version_str)
    if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12):
-      raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
+      raise VersionNotSupported(
+          f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'"
+      )
    return _RefTuple((*version, 'release' if _use_base_strategy else 'custom'))

  @classmethod
  @functools.lru_cache(maxsize=64)
-  def from_strategy(cls, strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver:
+  def from_strategy(cls,
+                    strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver:
    # using default strategy
    if strategy_or_version is None or strategy_or_version == 'release': return cls(*cls._release_ref())
    elif strategy_or_version == 'latest': return cls('latest', '0.0.0', 'latest')
@@ -115,7 +128,8 @@ class RefResolver:
      _ref = cls._nightly_ref()
      return cls(_ref[0], '0.0.0', _ref[-1])
    else:
-      logger.warning('Using custom %s. Make sure that it is at lease 0.2.12 for base container support.', strategy_or_version)
+      logger.warning('Using custom %s. Make sure that it is at lease 0.2.12 for base container support.',
+                     strategy_or_version)
      return cls(*cls._release_ref(version_str=strategy_or_version))

  @property
@@ -129,21 +143,27 @@ class RefResolver:
 def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str:
  return RefResolver.from_strategy(strategy).tag

-def build_container(
-    registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None,
-    version_strategy: LiteralContainerVersionStrategy = 'release',
-    push: bool = False,
-    machine: bool = False
-) -> dict[str | LiteralContainerRegistry, str]:
+def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None,
+                    version_strategy: LiteralContainerVersionStrategy = 'release',
+                    push: bool = False,
+                    machine: bool = False) -> dict[str | LiteralContainerRegistry, str]:
  try:
    if not _BUILDER.health(): raise openllm.exceptions.Error
  except (openllm.exceptions.Error, subprocess.CalledProcessError):
-    raise RuntimeError('Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.') from None
-  if openllm_core.utils.device_count() == 0: raise RuntimeError('Building base container requires GPUs (None available)')
-  if not shutil.which('nvidia-container-runtime'): raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
-  if not _module_location: raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
+    raise RuntimeError(
+        'Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.'
+    ) from None
+  if openllm_core.utils.device_count() == 0:
+    raise RuntimeError('Building base container requires GPUs (None available)')
+  if not shutil.which('nvidia-container-runtime'):
+    raise RuntimeError('NVIDIA Container Toolkit is required to compile CUDA kernel in container.')
+  if not _module_location:
+    raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
  pyproject_path = pathlib.Path(_module_location).parent.parent / 'pyproject.toml'
-  if not pyproject_path.exists(): raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'")
+  if not pyproject_path.exists():
+    raise ValueError(
+        "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'"
+    )
  if not registries:
    tags: dict[str | LiteralContainerRegistry, str] = {
        alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items()
@@ -152,24 +172,27 @@ def build_container(
    registries = [registries] if isinstance(registries, str) else list(registries)
    tags = {name: f'{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}' for name in registries}
  try:
-    outputs = _BUILDER.build(
-        file=pathlib.Path(__file__).parent.joinpath('Dockerfile').resolve().__fspath__(),
-        context_path=pyproject_path.parent.__fspath__(),
-        tag=tuple(tags.values()),
-        push=push,
-        progress='plain' if openllm_core.utils.get_debug_mode() else 'auto',
-        quiet=machine
-    )
+    outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath('Dockerfile').resolve().__fspath__(),
+                             context_path=pyproject_path.parent.__fspath__(),
+                             tag=tuple(tags.values()),
+                             push=push,
+                             progress='plain' if openllm_core.utils.get_debug_mode() else 'auto',
+                             quiet=machine)
    if machine and outputs is not None: tags['image_sha'] = outputs.decode('utf-8').strip()
  except Exception as err:
-    raise openllm.exceptions.OpenLLMException(f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err
+    raise openllm.exceptions.OpenLLMException(
+        f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}'
+    ) from err
  return tags

 if t.TYPE_CHECKING:
  CONTAINER_NAMES: dict[LiteralContainerRegistry, str]
  supported_registries: list[str]

-__all__ = ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver']
+__all__ = [
+    'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries',
+    'RefResolver'
+]

 def __dir__() -> list[str]:
  return sorted(__all__)
--- a/openllm-python/src/openllm/cli/_factory.py
+++ b/openllm-python/src/openllm/cli/_factory.py
@@ -40,27 +40,46 @@ _AnyCallable = t.Callable[..., t.Any]
 FC = t.TypeVar('FC', bound=t.Union[_AnyCallable, click.Command])

 def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-  return [sc.CompletionItem(str(it.tag), help='Bento') for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})]
+  return [
+      sc.CompletionItem(str(it.tag), help='Bento')
+      for it in bentoml.list()
+      if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {'start_name', 'bundler'})
+  ]

 def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
-  return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)]
+  return [
+      sc.CompletionItem(inflection.dasherize(it), help='Model')
+      for it in openllm.CONFIG_MAPPING
+      if it.startswith(incomplete)
+  ]

-def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
+def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float,
+                         device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
  # TODO: Support amd.com/gpu on k8s
  _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '')
  _bentoml_config_options_opts = [
-      'tracing.sample_rate=1.0',
-      f'api_server.traffic.timeout={server_timeout}',
+      'tracing.sample_rate=1.0', f'api_server.traffic.timeout={server_timeout}',
      f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}',
      f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'
  ]
  if device:
-    if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)])
-    else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
-  _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
+    if len(device) > 1:
+      _bentoml_config_options_opts.extend([
+          f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}'
+          for idx, dev in enumerate(device)
+      ])
+    else:
+      _bentoml_config_options_opts.append(
+          f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]')
+  _bentoml_config_options_opts.append(
+      f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}')
  if cors:
-    _bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
-    _bentoml_config_options_opts.extend([f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])])
+    _bentoml_config_options_opts.extend(
+        ['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"'])
+    _bentoml_config_options_opts.extend([
+        f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"'
+        for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])
+    ])
  _bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts)
  environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env
  if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env)
@@ -82,7 +101,10 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...
    ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0] if len(adapter_name) > 0 else None
  return None

-def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command:
+def start_command_factory(group: click.Group,
+                          model: str,
+                          _context_settings: DictStrAny | None = None,
+                          _serve_grpc: bool = False) -> click.Command:
  llm_config = openllm.AutoConfig.for_model(model)
  command_attrs: DictStrAny = dict(
      name=llm_config['model_name'],
@@ -113,37 +135,29 @@ Available official model_id(s): [default: {llm_config['default_id']}]
  if llm_config['requires_gpu'] and openllm.utils.device_count() < 1:
    # NOTE: The model requires GPU, therefore we will return a dummy command
    command_attrs.update({
-        'short_help': '(Disabled because there is no GPU available)', 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
+        'short_help':
+            '(Disabled because there is no GPU available)',
+        'help':
+            f'{model} is currently not available to run on your local machine because it requires GPU for inference.'
    })
    return noop_command(group, llm_config, _serve_grpc, **command_attrs)

  @group.command(**command_attrs)
  @start_decorator(llm_config, serve_grpc=_serve_grpc)
  @click.pass_context
-  def start_cmd(
-      ctx: click.Context,
-      /,
-      server_timeout: int,
-      model_id: str | None,
-      model_version: str | None,
-      workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
-      device: t.Tuple[str, ...],
-      quantize: t.Literal['int8', 'int4', 'gptq'] | None,
-      bettertransformer: bool | None,
-      runtime: t.Literal['ggml', 'transformers'],
-      fast: bool,
-      serialisation_format: t.Literal['safetensors', 'legacy'],
-      cors: bool,
-      adapter_id: str | None,
-      return_process: bool,
-      **attrs: t.Any,
-  ) -> LLMConfig | subprocess.Popen[bytes]:
+  def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None,
+                workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...],
+                quantize: t.Literal['int8', 'int4', 'gptq'] | None, bettertransformer: bool | None,
+                runtime: t.Literal['ggml', 'transformers'], fast: bool, serialisation_format: t.Literal['safetensors',
+                                                                                                        'legacy'],
+                cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any,
+               ) -> LLMConfig | subprocess.Popen[bytes]:
    fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES
-    if serialisation_format == 'safetensors' and quantize is not None and os.environ.get('OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
+    if serialisation_format == 'safetensors' and quantize is not None and os.environ.get(
+        'OPENLLM_SERIALIZATION_WARNING', str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES:
      termui.echo(
          f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
-          fg='yellow'
-      )
+          fg='yellow')
    adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None)
    config, server_attrs = llm_config.model_validate_click(**attrs)
    server_timeout = openllm.utils.first_not_none(server_timeout, default=config['timeout'])
@@ -169,16 +183,21 @@ Available official model_id(s): [default: {llm_config['default_id']}]
      wpr = float(wpr)

    # Create a new model env to work with the envvar during CLI invocation
-    env = openllm.utils.EnvVarMixin(
-        config['model_name'], config.default_implementation(), model_id=model_id or config['default_id'], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime
-    )
+    env = openllm.utils.EnvVarMixin(config['model_name'],
+                                    config.default_implementation(),
+                                    model_id=model_id or config['default_id'],
+                                    bettertransformer=bettertransformer,
+                                    quantize=quantize,
+                                    runtime=runtime)
    prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr))

    # NOTE: This is to set current configuration
    start_env = os.environ.copy()
    start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env)
    if fast:
-      termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg='yellow')
+      termui.echo(
+          f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'",
+          fg='yellow')

    start_env.update({
        'OPENLLM_MODEL': model,
@@ -194,18 +213,28 @@ Available official model_id(s): [default: {llm_config['default_id']}]
    if bettertransformer is not None: start_env[env.bettertransformer] = str(env['bettertransformer_value'])
    if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))

-    llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(
-        model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format
-    )
+    llm = openllm.utils.infer_auto_class(env['framework_value']).for_model(model,
+                                                                           model_id=start_env[env.model_id],
+                                                                           model_version=model_version,
+                                                                           llm_config=config,
+                                                                           ensure_available=not fast,
+                                                                           adapter_map=adapter_map,
+                                                                           serialisation=serialisation_format)
    start_env.update({env.config: llm.config.model_dump_json().decode()})

-    server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
+    server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer(
+        '_service:svc', **server_attrs)
    openllm.utils.analytics.track_start_init(llm.config)

    def next_step(model_name: str, adapter_map: DictStrAny | None) -> None:
      cmd_name = f'openllm build {model_name}'
-      if adapter_map is not None: cmd_name += ' ' + ' '.join([f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]])
-      if not openllm.utils.get_quiet_mode(): termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')
+      if adapter_map is not None:
+        cmd_name += ' ' + ' '.join([
+            f'--adapter-id {s}'
+            for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]
+        ])
+      if not openllm.utils.get_quiet_mode():
+        termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue')

    if return_process:
      server.start(env=start_env, text=True)
@@ -239,30 +268,35 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, *

  return noop

-def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
-  if adapter_map and not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
+def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None,
+                       adapter_map: dict[str, str | None] | None, num_workers: int) -> None:
+  if adapter_map and not openllm.utils.is_peft_available():
+    ctx.fail(
+        "Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'")
  if quantize and llm_config.default_implementation() == 'vllm':
-    ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.")
+    ctx.fail(
+        f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization."
+    )
  requirements = llm_config['requirements']
  if requirements is not None and len(requirements) > 0:
    missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None]
-    if len(missing_requirements) > 0: termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')
+    if len(missing_requirements) > 0:
+      termui.echo(f'Make sure to have the following dependencies available: {missing_requirements}', fg='yellow')

 def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]:
+
  def wrapper(fn: FC) -> t.Callable[[FC], FC]:
    composed = openllm.utils.compose(
-        llm_config.to_click_options,
-        _http_server_args if not serve_grpc else _grpc_server_args,
-        cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
-        model_id_option(factory=cog.optgroup, model_env=llm_config['env']),
-        model_version_option(factory=cog.optgroup),
-        cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
-        workers_per_resource_option(factory=cog.optgroup),
-        cors_option(factory=cog.optgroup),
-        fast_option(factory=cog.optgroup),
+        llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args,
        cog.optgroup.group(
-            'LLM Optimization Options',
-            help='''Optimization related options.
+            'General LLM Options',
+            help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."),
+        model_id_option(factory=cog.optgroup, model_env=llm_config['env']), model_version_option(factory=cog.optgroup),
+        cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'),
+        workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup),
+        fast_option(factory=cog.optgroup),
+        cog.optgroup.group('LLM Optimization Options',
+                           help='''Optimization related options.

            OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/),
            k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
@@ -272,23 +306,23 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
            - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
            - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
            ''',
-        ),
-        cog.optgroup.option(
-            '--device',
-            type=openllm.utils.dantic.CUDA,
-            multiple=True,
-            envvar='CUDA_VISIBLE_DEVICES',
-            callback=parse_device_callback,
-            help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
-            show_envvar=True
-        ),
-        cog.optgroup.option('--runtime', type=click.Choice(['ggml', 'transformers']), default='transformers', help='The runtime to use for the given model. Default is transformers.'),
+                          ),
+        cog.optgroup.option('--device',
+                            type=openllm.utils.dantic.CUDA,
+                            multiple=True,
+                            envvar='CUDA_VISIBLE_DEVICES',
+                            callback=parse_device_callback,
+                            help=f"Assign GPU devices (if available) for {llm_config['model_name']}.",
+                            show_envvar=True),
+        cog.optgroup.option('--runtime',
+                            type=click.Choice(['ggml', 'transformers']),
+                            default='transformers',
+                            help='The runtime to use for the given model. Default is transformers.'),
        quantize_option(factory=cog.optgroup, model_env=llm_config['env']),
        bettertransformer_option(factory=cog.optgroup, model_env=llm_config['env']),
        serialisation_option(factory=cog.optgroup),
-        cog.optgroup.group(
-            'Fine-tuning related options',
-            help='''\
+        cog.optgroup.group('Fine-tuning related options',
+                           help='''\
    Note that the argument `--adapter-id` can accept the following format:

    - `--adapter-id /path/to/adapter` (local adapter)
@@ -302,23 +336,22 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
    $ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora

    ```
-    '''
-        ),
-        cog.optgroup.option(
-            '--adapter-id',
-            default=None,
-            help='Optional name or path for given LoRA adapter' + f" to wrap '{llm_config['model_name']}'",
-            multiple=True,
-            callback=_id_callback,
-            metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'
-        ),
+    '''),
+        cog.optgroup.option('--adapter-id',
+                            default=None,
+                            help='Optional name or path for given LoRA adapter' +
+                            f" to wrap '{llm_config['model_name']}'",
+                            multiple=True,
+                            callback=_id_callback,
+                            metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'),
        click.option('--return-process', is_flag=True, default=False, help='Internal use only.', hidden=True),
    )
    return composed(fn)

  return wrapper

-def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
+def parse_device_callback(ctx: click.Context, param: click.Parameter,
+                          value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None:
  if value is None: return value
  if not isinstance(value, tuple): ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})')
  el: t.Tuple[str, ...] = tuple(i for k in value for i in k)
@@ -337,14 +370,18 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]

  command = 'serve' if not serve_grpc else 'serve-grpc'
  group = cog.optgroup.group(
-      f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
+      f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options",
+      help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",
  )

  def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]:
    serve_command = cli.commands[command]
    # The first variable is the argument bento
    # The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS
-    serve_options = [p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS]
+    serve_options = [
+        p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS]
+        if p.name not in _IGNORED_OPTIONS
+    ]
    for options in reversed(serve_options):
      attrs = options.to_info_dict()
      # we don't need param_type_name, since it should all be options
@@ -381,73 +418,90 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC |
 cli_option = functools.partial(_click_factory_type, attr='option')
 cli_argument = functools.partial(_click_factory_type, attr='argument')

-def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]:
+def output_option(f: _AnyCallable | None = None,
+                  *,
+                  default_value: LiteralOutput = 'pretty',
+                  **attrs: t.Any) -> t.Callable[[FC], FC]:
  output = ['json', 'pretty', 'porcelain']

  def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
    return [CompletionItem(it) for it in output]

-  return cli_option(
-      '-o',
-      '--output',
-      'output',
-      type=click.Choice(output),
-      default=default_value,
-      help='Showing output type.',
-      show_default=True,
-      envvar='OPENLLM_OUTPUT',
-      show_envvar=True,
-      shell_complete=complete_output_var,
-      **attrs
-  )(f)
+  return cli_option('-o',
+                    '--output',
+                    'output',
+                    type=click.Choice(output),
+                    default=default_value,
+                    help='Showing output type.',
+                    show_default=True,
+                    envvar='OPENLLM_OUTPUT',
+                    show_envvar=True,
+                    shell_complete=complete_output_var,
+                    **attrs)(f)

 def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--fast/--no-fast',
-      show_default=True,
-      default=False,
-      envvar='OPENLLM_USE_LOCAL_LATEST',
-      show_envvar=True,
-      help='''Whether to skip checking if models is already in store.
+  return cli_option('--fast/--no-fast',
+                    show_default=True,
+                    default=False,
+                    envvar='OPENLLM_USE_LOCAL_LATEST',
+                    show_envvar=True,
+                    help='''Whether to skip checking if models is already in store.

                                                                                                          This is useful if you already downloaded or setup the model beforehand.
                                                                                                          ''',
-      **attrs
-  )(f)
+                    **attrs)(f)

 def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option('--cors/--no-cors', show_default=True, default=False, envvar='OPENLLM_CORS', show_envvar=True, help='Enable CORS for the server.', **attrs)(f)
+  return cli_option('--cors/--no-cors',
+                    show_default=True,
+                    default=False,
+                    envvar='OPENLLM_CORS',
+                    show_envvar=True,
+                    help='Enable CORS for the server.',
+                    **attrs)(f)

 def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option('--machine', is_flag=True, default=False, hidden=True, **attrs)(f)

-def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--model-id',
-      type=click.STRING,
-      default=None,
-      envvar=model_env.model_id if model_env is not None else None,
-      show_envvar=model_env is not None,
-      help='Optional model_id name or path for (fine-tune) weight.',
-      **attrs
-  )(f)
+def model_id_option(f: _AnyCallable | None = None,
+                    *,
+                    model_env: openllm.utils.EnvVarMixin | None = None,
+                    **attrs: t.Any) -> t.Callable[[FC], FC]:
+  return cli_option('--model-id',
+                    type=click.STRING,
+                    default=None,
+                    envvar=model_env.model_id if model_env is not None else None,
+                    show_envvar=model_env is not None,
+                    help='Optional model_id name or path for (fine-tune) weight.',
+                    **attrs)(f)

 def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option('--model-version', type=click.STRING, default=None, help='Optional model version to save for this model. It will be inferred automatically from model-id.', **attrs)(f)
+  return cli_option(
+      '--model-version',
+      type=click.STRING,
+      default=None,
+      help='Optional model version to save for this model. It will be inferred automatically from model-id.',
+      **attrs)(f)

 def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
+  return cli_argument('model_name',
+                      type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]),
+                      required=required,
+                      **attrs)(f)

-def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--quantise',
-      '--quantize',
-      'quantize',
-      type=click.Choice(['int8', 'int4', 'gptq']),
-      default=None,
-      envvar=model_env.quantize if model_env is not None else None,
-      show_envvar=model_env is not None,
-      help='''Dynamic quantization for running this LLM.
+def quantize_option(f: _AnyCallable | None = None,
+                    *,
+                    build: bool = False,
+                    model_env: openllm.utils.EnvVarMixin | None = None,
+                    **attrs: t.Any) -> t.Callable[[FC], FC]:
+  return cli_option('--quantise',
+                    '--quantize',
+                    'quantize',
+                    type=click.Choice(['int8', 'int4', 'gptq']),
+                    default=None,
+                    envvar=model_env.quantize if model_env is not None else None,
+                    show_envvar=model_env is not None,
+                    help='''Dynamic quantization for running this LLM.

      The following quantization strategies are supported:

@@ -461,17 +515,18 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model
      ''' + ('''
      > [!NOTE] that this will set the mode for serving within deployment.''' if build else '') + '''
      > [!NOTE] that quantization are currently only available in *PyTorch* models.''',
-      **attrs
-  )(f)
+                    **attrs)(f)

-def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--workers-per-resource',
-      default=None,
-      callback=workers_per_resource_callback,
-      type=str,
-      required=False,
-      help='''Number of workers per resource assigned.
+def workers_per_resource_option(f: _AnyCallable | None = None,
+                                *,
+                                build: bool = False,
+                                **attrs: t.Any) -> t.Callable[[FC], FC]:
+  return cli_option('--workers-per-resource',
+                    default=None,
+                    callback=workers_per_resource_callback,
+                    type=str,
+                    required=False,
+                    help='''Number of workers per resource assigned.

      See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
      for more information. By default, this is set to 1.
@@ -481,38 +536,37 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
      - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.

      - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
-      ''' + (
-          """\n
+      ''' + ("""\n
      > [!NOTE] The workers value passed into 'build' will determine how the LLM can
      > be provisioned in Kubernetes as well as in standalone container. This will
-      > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''
-      ),
-      **attrs
-  )(f)
+      > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ''),
+                    **attrs)(f)

-def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
+def bettertransformer_option(f: _AnyCallable | None = None,
+                             *,
+                             build: bool = False,
+                             model_env: openllm.utils.EnvVarMixin | None = None,
+                             **attrs: t.Any) -> t.Callable[[FC], FC]:
  return cli_option(
      '--bettertransformer',
      is_flag=True,
      default=None,
      envvar=model_env.bettertransformer if model_env is not None else None,
      show_envvar=model_env is not None,
-      help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.'
-      if not build else 'Set default environment variable whether to serve this model with FasterTransformer in build time.',
-      **attrs
-  )(f)
+      help='Apply FasterTransformer wrapper to serve model. This will applies during serving time.' if not build else
+      'Set default environment variable whether to serve this model with FasterTransformer in build time.',
+      **attrs)(f)

 def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--serialisation',
-      '--serialization',
-      'serialisation_format',
-      type=click.Choice(['safetensors', 'legacy']),
-      default='safetensors',
-      show_default=True,
-      show_envvar=True,
-      envvar='OPENLLM_SERIALIZATION',
-      help='''Serialisation format for save/load LLM.
+  return cli_option('--serialisation',
+                    '--serialization',
+                    'serialisation_format',
+                    type=click.Choice(['safetensors', 'legacy']),
+                    default='safetensors',
+                    show_default=True,
+                    show_envvar=True,
+                    envvar='OPENLLM_SERIALIZATION',
+                    help='''Serialisation format for save/load LLM.

      Currently the following strategies are supported:

@@ -529,28 +583,25 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal

      > [!NOTE] that GGML format is working in progress.
      ''',
-      **attrs
-  )(f)
+                    **attrs)(f)

 def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
-  return cli_option(
-      '--container-registry',
-      'container_registry',
-      type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
-      default='ecr',
-      show_default=True,
-      show_envvar=True,
-      envvar='OPENLLM_CONTAINER_REGISTRY',
-      callback=container_registry_callback,
-      help='''The default container registry to get the base image for building BentoLLM.
+  return cli_option('--container-registry',
+                    'container_registry',
+                    type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)),
+                    default='ecr',
+                    show_default=True,
+                    show_envvar=True,
+                    envvar='OPENLLM_CONTAINER_REGISTRY',
+                    callback=container_registry_callback,
+                    help='''The default container registry to get the base image for building BentoLLM.

      Currently, it supports 'ecr', 'ghcr.io', 'docker.io'

      \b
      > [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information.
      ''',
-      **attrs
-  )(f)
+                    **attrs)(f)

 _wpr_strategies = {'round_robin', 'conserved'}

@@ -562,11 +613,14 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va
    try:
      float(value)  # type: ignore[arg-type]
    except ValueError:
-      raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None
+      raise click.BadParameter(
+          f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.",
+          ctx, param) from None
    else:
      return value

 def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None:
  if value is None: return value
-  if value not in openllm.bundle.supported_registries: raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param)
+  if value not in openllm.bundle.supported_registries:
+    raise click.BadParameter(f'Value must be one of {openllm.bundle.supported_registries}', ctx, param)
  return value
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -30,25 +30,23 @@ if t.TYPE_CHECKING:

 logger = logging.getLogger(__name__)

-def _start(
-    model_name: str,
-    /,
-    *,
-    model_id: str | None = None,
-    timeout: int = 30,
-    workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
-    device: tuple[str, ...] | t.Literal['all'] | None = None,
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-    bettertransformer: bool | None = None,
-    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
-    adapter_map: dict[LiteralString, str | None] | None = None,
-    framework: LiteralRuntime | None = None,
-    additional_args: list[str] | None = None,
-    cors: bool = False,
-    _serve_grpc: bool = False,
-    __test__: bool = False,
-    **_: t.Any
-) -> LLMConfig | subprocess.Popen[bytes]:
+def _start(model_name: str,
+           /,
+           *,
+           model_id: str | None = None,
+           timeout: int = 30,
+           workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
+           device: tuple[str, ...] | t.Literal['all'] | None = None,
+           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+           bettertransformer: bool | None = None,
+           runtime: t.Literal['ggml', 'transformers'] = 'transformers',
+           adapter_map: dict[LiteralString, str | None] | None = None,
+           framework: LiteralRuntime | None = None,
+           additional_args: list[str] | None = None,
+           cors: bool = False,
+           _serve_grpc: bool = False,
+           __test__: bool = False,
+           **_: t.Any) -> LLMConfig | subprocess.Popen[bytes]:
  """Python API to start a LLM server. These provides one-to-one mapping to CLI arguments.

  For all additional arguments, pass it as string to ``additional_args``. For example, if you want to
@@ -91,58 +89,66 @@ def _start(
  from .entrypoint import start_command
  from .entrypoint import start_grpc_command
  llm_config = openllm.AutoConfig.for_model(model_name)
-  _ModelEnv = openllm_core.utils.EnvVarMixin(
-      model_name,
-      openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()),
-      model_id=model_id,
-      bettertransformer=bettertransformer,
-      quantize=quantize,
-      runtime=runtime
-  )
+  _ModelEnv = openllm_core.utils.EnvVarMixin(model_name,
+                                             openllm_core.utils.first_not_none(
+                                                 framework, default=llm_config.default_implementation()),
+                                             model_id=model_id,
+                                             bettertransformer=bettertransformer,
+                                             quantize=quantize,
+                                             runtime=runtime)
  os.environ[_ModelEnv.framework] = _ModelEnv['framework_value']

  args: list[str] = ['--runtime', runtime]
  if model_id: args.extend(['--model-id', model_id])
  if timeout: args.extend(['--server-timeout', str(timeout)])
-  if workers_per_resource: args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource])
+  if workers_per_resource:
+    args.extend([
+        '--workers-per-resource',
+        str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource
+    ])
  if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)])
-  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
+  if quantize and bettertransformer:
+    raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(['--quantize', str(quantize)])
  elif bettertransformer: args.append('--bettertransformer')
  if cors: args.append('--cors')
-  if adapter_map: args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()])))
+  if adapter_map:
+    args.extend(
+        list(
+            itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()
+                                          ])))
  if additional_args: args.extend(additional_args)
  if __test__: args.append('--return-process')

-  return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(
-      args=args if len(args) > 0 else None, standalone_mode=False
-  )
+  return start_command_factory(start_command if not _serve_grpc else start_grpc_command,
+                               model_name,
+                               _context_settings=termui.CONTEXT_SETTINGS,
+                               _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None,
+                                                             standalone_mode=False)

@inject
-def _build(
-    model_name: str,
-    /,
-    *,
-    model_id: str | None = None,
-    model_version: str | None = None,
-    bento_version: str | None = None,
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-    bettertransformer: bool | None = None,
-    adapter_map: dict[str, str | None] | None = None,
-    build_ctx: str | None = None,
-    enable_features: tuple[str, ...] | None = None,
-    workers_per_resource: float | None = None,
-    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
-    dockerfile_template: str | None = None,
-    overwrite: bool = False,
-    container_registry: LiteralContainerRegistry | None = None,
-    container_version_strategy: LiteralContainerVersionStrategy | None = None,
-    push: bool = False,
-    containerize: bool = False,
-    serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
-    additional_args: list[str] | None = None,
-    bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
-) -> bentoml.Bento:
+def _build(model_name: str,
+           /,
+           *,
+           model_id: str | None = None,
+           model_version: str | None = None,
+           bento_version: str | None = None,
+           quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+           bettertransformer: bool | None = None,
+           adapter_map: dict[str, str | None] | None = None,
+           build_ctx: str | None = None,
+           enable_features: tuple[str, ...] | None = None,
+           workers_per_resource: float | None = None,
+           runtime: t.Literal['ggml', 'transformers'] = 'transformers',
+           dockerfile_template: str | None = None,
+           overwrite: bool = False,
+           container_registry: LiteralContainerRegistry | None = None,
+           container_version_strategy: LiteralContainerVersionStrategy | None = None,
+           push: bool = False,
+           containerize: bool = False,
+           serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
+           additional_args: list[str] | None = None,
+           bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
  """Package a LLM into a Bento.

  The LLM will be built into a BentoService with the following structure:
@@ -192,8 +198,12 @@ def _build(
  Returns:
      ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
-  args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation', serialisation_format]
-  if quantize and bettertransformer: raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
+  args: list[str] = [
+      sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--runtime', runtime, '--serialisation',
+      serialisation_format
+  ]
+  if quantize and bettertransformer:
+    raise OpenLLMException("'quantize' and 'bettertransformer' are currently mutually exclusive.")
  if quantize: args.extend(['--quantize', quantize])
  if bettertransformer: args.append('--bettertransformer')
  if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
@@ -221,21 +231,21 @@ def _build(
    raise OpenLLMException(str(e)) from None
  matched = re.match(r'__tag__:([^:\n]+:[^:\n]+)$', output.decode('utf-8').strip())
  if matched is None:
-    raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.")
+    raise ValueError(
+        f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub."
+    )
  return bentoml.get(matched.group(1), _bento_store=bento_store)

-def _import_model(
-    model_name: str,
-    /,
-    *,
-    model_id: str | None = None,
-    model_version: str | None = None,
-    runtime: t.Literal['ggml', 'transformers'] = 'transformers',
-    implementation: LiteralRuntime = 'pt',
-    quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
-    serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
-    additional_args: t.Sequence[str] | None = None
-) -> bentoml.Model:
+def _import_model(model_name: str,
+                  /,
+                  *,
+                  model_id: str | None = None,
+                  model_version: str | None = None,
+                  runtime: t.Literal['ggml', 'transformers'] = 'transformers',
+                  implementation: LiteralRuntime = 'pt',
+                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+                  serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
+                  additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
  """Import a LLM into local store.

  > [!NOTE]
@@ -267,7 +277,10 @@ def _import_model(
      ``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
  """
  from .entrypoint import import_command
-  args = [model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation', serialisation_format,]
+  args = [
+      model_name, '--runtime', runtime, '--implementation', implementation, '--machine', '--serialisation',
+      serialisation_format,
+  ]
  if model_id is not None: args.append(model_id)
  if model_version is not None: args.extend(['--model-version', str(model_version)])
  if additional_args is not None: args.extend(additional_args)
@@ -278,5 +291,9 @@ def _list_models() -> dict[str, t.Any]:
  '''List all available models within the local store.'''
  from .entrypoint import models_command
  return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False)
-start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
+
+start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(
+    _start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(
+        _start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(
+            _import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
 __all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models']
--- a/openllm-python/src/openllm/cli/extension/build_base_container.py
+++ b/openllm-python/src/openllm/cli/extension/build_base_container.py
@@ -14,10 +14,9 @@ if t.TYPE_CHECKING:
  from openllm_core._typing_compat import LiteralContainerRegistry
  from openllm_core._typing_compat import LiteralContainerVersionStrategy

-@click.command(
-    'build_base_container',
-    context_settings=termui.CONTEXT_SETTINGS,
-    help='''Base image builder for BentoLLM.
+@click.command('build_base_container',
+               context_settings=termui.CONTEXT_SETTINGS,
+               help='''Base image builder for BentoLLM.

                By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
                Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
@@ -27,13 +26,16 @@ if t.TYPE_CHECKING:
                This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.

                Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
-                '''
-)
+                ''')
@container_registry_option
-@click.option('--version-strategy', type=click.Choice(['release', 'latest', 'nightly']), default='nightly', help='Version strategy to use for tagging the image.')
+@click.option('--version-strategy',
+              type=click.Choice(['release', 'latest', 'nightly']),
+              default='nightly',
+              help='Version strategy to use for tagging the image.')
@click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False)
@machine_option
-def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
+def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None,
+        version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
  mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
  if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return mapping
--- a/openllm-python/src/openllm/cli/extension/dive_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py
@@ -24,14 +24,19 @@ if t.TYPE_CHECKING:
@machine_option
@click.pass_context
@inject
-def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
+def cli(ctx: click.Context,
+        bento: str,
+        machine: bool,
+        _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
  '''Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path).'''
  try:
    bentomodel = _bento_store.get(bento)
  except bentoml.exceptions.NotFound:
    ctx.fail(f'Bento {bento} not found. Make sure to call `openllm build` first.')
  if 'bundler' not in bentomodel.info.labels or bentomodel.info.labels['bundler'] != 'openllm.bundle':
-    ctx.fail(f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness.")
+    ctx.fail(
+        f"Bento is either too old or not built with OpenLLM. Make sure to use ``openllm build {bentomodel.info.labels['start_name']}`` for correctness."
+    )
  if machine: return bentomodel.path
  # copy and paste this into a new shell
  if psutil.WINDOWS: subprocess.check_call([shutil.which('dir') or 'dir'], cwd=bentomodel.path)
--- a/openllm-python/src/openllm/cli/extension/get_containerfile.py
+++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py
@@ -19,7 +19,9 @@ from openllm_core.utils import bentoml_cattr
 if t.TYPE_CHECKING:
  from bentoml._internal.bento import BentoStore

-@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.')
+@click.command('get_containerfile',
+               context_settings=termui.CONTEXT_SETTINGS,
+               help='Return Containerfile of any given Bento.')
@click.argument('bento', type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject
@@ -39,7 +41,13 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento
    # NOTE: if users specify a dockerfile_template, we will
    # save it to /env/docker/Dockerfile.template. This is necessary
    # for the reconstruction of the Dockerfile.
-    if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None: docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
-    doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True)
+    if 'dockerfile_template' in docker_attrs and docker_attrs['dockerfile_template'] is not None:
+      docker_attrs['dockerfile_template'] = 'env/docker/Dockerfile.template'
+    doc = generate_containerfile(docker=DockerOptions(**docker_attrs),
+                                 build_ctx=bentomodel.path,
+                                 conda=options.conda,
+                                 bento_fs=bentomodel._fs,
+                                 enable_buildkit=True,
+                                 add_header=True)
    termui.echo(doc, fg='white')
  return bentomodel.path
--- a/openllm-python/src/openllm/cli/extension/get_prompt.py
+++ b/openllm-python/src/openllm/cli/extension/get_prompt.py
@@ -18,41 +18,51 @@ from openllm_core._prompt import process_prompt
 LiteralOutput = t.Literal['json', 'pretty', 'porcelain']

@click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS)
-@click.argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
+@click.argument('model_name',
+                type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]),
+                shell_complete=model_complete_envvar)
@click.argument('prompt', type=click.STRING)
@output_option
@click.option('--format', type=click.STRING, default=None)
@machine_option
-@click.option(
-    '--opt',
-    help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
-    required=False,
-    multiple=True,
-    callback=opt_callback,
-    metavar='ARG=VALUE[,ARG=VALUE]'
-)
+@click.option('--opt',
+              help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)",
+              required=False,
+              multiple=True,
+              callback=opt_callback,
+              metavar='ARG=VALUE[,ARG=VALUE]')
@click.pass_context
-def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
+def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool,
+        _memoized: dict[str, t.Any], **_: t.Any) -> str | None:
  '''Get the default prompt used by OpenLLM.'''
  module = openllm.utils.EnvVarMixin(model_name).module
  _memoized = {k: v[0] for k, v in _memoized.items() if v}
  try:
    template = getattr(module, 'DEFAULT_PROMPT_TEMPLATE', None)
    prompt_mapping = getattr(module, 'PROMPT_MAPPING', None)
-    if template is None: raise click.BadArgumentUsage(f'model {model_name} does not have a default prompt template') from None
+    if template is None:
+      raise click.BadArgumentUsage(f'model {model_name} does not have a default prompt template') from None
    if callable(template):
      if format is None:
-        if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None: raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
-        raise click.BadOptionUsage('format', f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
-      if prompt_mapping is None: raise click.BadArgumentUsage(f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None
-      if format not in prompt_mapping: raise click.BadOptionUsage('format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
+        if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None:
+          raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.')
+        raise click.BadOptionUsage(
+            'format',
+            f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})")
+      if prompt_mapping is None:
+        raise click.BadArgumentUsage(
+            f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None
+      if format not in prompt_mapping:
+        raise click.BadOptionUsage(
+            'format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})')
      _prompt_template = template(format)
    else:
      _prompt_template = template
    fully_formatted = process_prompt(prompt, _prompt_template, True, **_memoized)
    if machine: return repr(fully_formatted)
    elif output == 'porcelain': termui.echo(repr(fully_formatted), fg='white')
-    elif output == 'json': termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
+    elif output == 'json':
+      termui.echo(orjson.dumps({'prompt': fully_formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white')
    else:
      termui.echo(f'== Prompt for {model_name} ==\n', fg='magenta')
      termui.echo(fully_formatted, fg='white')
--- a/openllm-python/src/openllm/cli/extension/list_bentos.py
+++ b/openllm-python/src/openllm/cli/extension/list_bentos.py
@@ -19,23 +19,27 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None:
  '''List available bentos built by OpenLLM.'''
  mapping = {
      k: [{
-          'tag': str(b.tag),
-          'size': human_readable_size(openllm.utils.calc_dir_size(b.path)),
+          'tag':
+              str(b.tag),
+          'size':
+              human_readable_size(openllm.utils.calc_dir_size(b.path)),
          'models': [{
-              'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
+              'tag': str(m.tag),
+              'size': human_readable_size(openllm.utils.calc_dir_size(m.path))
          } for m in (bentoml.models.get(_.tag) for _ in b.info.models)]
-      } for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k] for k in tuple(
-          inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()
-      )
+      } for b in tuple(i for i in bentoml.list() if all(
+          k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k
+         ] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
  }
  mapping = {k: v for k, v in mapping.items() if v}
  if output == 'pretty':
    import tabulate
    tabulate.PRESERVE_WHITESPACE = True
-    termui.echo(
-        tabulate.tabulate([(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size', 'Models']),
-        fg='white'
-    )
+    termui.echo(tabulate.tabulate(
+        [(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v],
+        tablefmt='fancy_grid',
+        headers=['LLM', 'Tag', 'Size', 'Models']),
+                fg='white')
  else:
    termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white')
  ctx.exit(0)
--- a/openllm-python/src/openllm/cli/extension/list_models.py
+++ b/openllm-python/src/openllm/cli/extension/list_models.py
@@ -25,17 +25,33 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
  '''This is equivalent to openllm models --show-available less the nice table.'''
  models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
  ids_in_local_store = {
-      k: [i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
-          ] for k in models
+      k: [
+          i for i in bentoml.models.list() if 'framework' in i.info.labels and
+          i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k
+      ] for k in models
  }
  if model_name is not None:
-    ids_in_local_store = {k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()}
+    ids_in_local_store = {
+        k: [
+            i
+            for i in v
+            if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)
+        ] for k, v in ids_in_local_store.items()
+    }
  ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v}
-  local_models = {k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()}
+  local_models = {
+      k: [{
+          'tag': str(i.tag),
+          'size': human_readable_size(openllm.utils.calc_dir_size(i.path))
+      } for i in val] for k, val in ids_in_local_store.items()
+  }
  if output == 'pretty':
    import tabulate
    tabulate.PRESERVE_WHITESPACE = True
-    termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v], tablefmt='fancy_grid', headers=['LLM', 'Tag', 'Size']), fg='white')
+    termui.echo(tabulate.tabulate([(k, i['tag'], i['size']) for k, v in local_models.items() for i in v],
+                                  tablefmt='fancy_grid',
+                                  headers=['LLM', 'Tag', 'Size']),
+                fg='white')
  else:
    termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white')
  return local_models
--- a/openllm-python/src/openllm/cli/extension/playground.py
+++ b/openllm-python/src/openllm/cli/extension/playground.py
@@ -28,12 +28,18 @@ logger = logging.getLogger(__name__)
 def load_notebook_metadata() -> DictStrAny:
  with open(os.path.join(os.path.dirname(playground.__file__), '_meta.yml'), 'r') as f:
    content = yaml.safe_load(f)
-  if not all('description' in k for k in content.values()): raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
+  if not all('description' in k for k in content.values()):
+    raise ValueError("Invalid metadata file. All entries must have a 'description' key.")
  return content

@click.command('playground', context_settings=termui.CONTEXT_SETTINGS)
@click.argument('output-dir', default=None, required=False)
-@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server')
+@click.option('--port',
+              envvar='JUPYTER_PORT',
+              show_envvar=True,
+              show_default=True,
+              default=8888,
+              help='Default port for Jupyter server')
@click.pass_context
 def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  """OpenLLM Playground.
@@ -54,7 +60,9 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
  """
  if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
-    raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
+    raise RuntimeError(
+        "Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'"
+    )
  metadata = load_notebook_metadata()
  _temp_dir = False
  if output_dir is None:
@@ -66,7 +74,8 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
  termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue')
  for module in pkgutil.iter_modules(playground.__path__):
    if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')):
-      logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module')
+      logger.debug('Skipping: %s (%s)', module.name,
+                   'File already exists' if not module.ispkg else f'{module.name} is a module')
      continue
    if not isinstance(module.module_finder, importlib.machinery.FileFinder): continue
    termui.echo('Generating notebook for: ' + module.name, fg='magenta')
@@ -75,7 +84,10 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
    f.cells.insert(0, markdown_cell)
    jupytext.write(f, os.path.join(output_dir, module.name + '.ipynb'), fmt='notebook')
  try:
-    subprocess.check_output([sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port', str(port), '--no-browser', '--debug'])
+    subprocess.check_output([
+        sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port',
+        str(port), '--no-browser', '--debug'
+    ])
  except subprocess.CalledProcessError as e:
    termui.echo(e.output, fg='red')
    raise click.ClickException(f'Failed to start a jupyter server:\n{e}') from None
--- a/openllm-python/src/openllm/cli/termui.py
+++ b/openllm-python/src/openllm/cli/termui.py
@@ -12,8 +12,13 @@ if t.TYPE_CHECKING:

 def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.Any) -> None:
  attrs['fg'] = fg if not openllm.utils.get_debug_mode() else None
-  if not openllm.utils.get_quiet_mode(): t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)
+  if not openllm.utils.get_quiet_mode():
+    t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs)

 COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
-CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore}
+CONTEXT_SETTINGS: DictStrAny = {
+    'help_option_names': ['-h', '--help'],
+    'max_content_width': COLUMNS,
+    'token_normalize_func': inflection.underscore
+}
 __all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS']
--- a/openllm-python/src/openllm/models/auto/init.py
+++ b/openllm-python/src/openllm/models/auto/init.py
@@ -43,7 +43,8 @@ except openllm.exceptions.MissingDependencyError:
  pass
 else:
  _import_structure['modeling_flax_auto'].extend(['AutoFlaxLLM', 'MODEL_FLAX_MAPPING'])
-  if t.TYPE_CHECKING: from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
+  if t.TYPE_CHECKING:
+    from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
 try:
  if not is_tf_available(): raise openllm.exceptions.MissingDependencyError
 except openllm.exceptions.MissingDependencyError:
--- a/openllm-python/src/openllm/models/auto/factory.py
+++ b/openllm-python/src/openllm/models/auto/factory.py
@@ -30,10 +30,18 @@ class BaseAutoLLMClass:
  _model_mapping: t.ClassVar[_LazyAutoMapping]

  def __init__(self, *args: t.Any, **attrs: t.Any):
-    raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.")
+    raise EnvironmentError(
+        f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead."
+    )

  @classmethod
-  def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False,
+  def for_model(cls,
+                model: str,
+                /,
+                model_id: str | None = None,
+                model_version: str | None = None,
+                llm_config: openllm.LLMConfig | None = None,
+                ensure_available: bool = False,
                **attrs: t.Any) -> openllm.LLM[t.Any, t.Any]:
    '''The lower level API for creating a LLM instance.

@@ -42,7 +50,10 @@ class BaseAutoLLMClass:
    >>> llm = openllm.AutoLLM.for_model("flan-t5")
    ```
    '''
-    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
+    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id,
+                                                           model_version=model_version,
+                                                           llm_config=llm_config,
+                                                           **attrs)
    if ensure_available: llm.ensure_model_id_exists()
    return llm

@@ -105,7 +116,9 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
  This OrderedDict values() and keys() returns the list instead, so you don't
  have to do list(mapping.values()) to get the list of values.
  """
-  def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], model_mapping: OrderedDict[LiteralString, LiteralString]):
+
+  def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString],
+               model_mapping: OrderedDict[LiteralString, LiteralString]):
    self._config_mapping = config_mapping
    self._reverse_config_mapping = {v: k for k, v in config_mapping.items()}
    self._model_mapping = model_mapping
@@ -115,7 +128,8 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
  def __getitem__(self, key: type[openllm.LLMConfig]) -> type[openllm.LLM[t.Any, t.Any]]:
    if key in self._extra_content: return self._extra_content[key]
    model_type = self._reverse_config_mapping[key.__name__]
-    if model_type in self._model_mapping: return self._load_attr_from_module(model_type, self._model_mapping[model_type])
+    if model_type in self._model_mapping:
+      return self._load_attr_from_module(model_type, self._model_mapping[model_type])
    # Maybe there was several model types associated with this config.
    model_types = [k for k, v in self._config_mapping.items() if v == key.__name__]
    for mtype in model_types:
@@ -124,7 +138,8 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):

  def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
    module_name = inflection.underscore(model_type)
-    if module_name not in self._modules: self._modules[module_name] = importlib.import_module(f'.{module_name}', 'openllm.models')
+    if module_name not in self._modules:
+      self._modules[module_name] = importlib.import_module(f'.{module_name}', 'openllm.models')
    return getattribute_from_module(self._modules[module_name], attr)

  def __len__(self) -> int:
@@ -138,29 +153,32 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
    return ReprMixin.__repr__(self)

  def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]:
-    yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping)
+    yield from ((key, (value, self._model_mapping[key]))
+                for key, value in self._config_mapping.items()
+                if key in self._model_mapping)

  def __bool__(self) -> bool:
    return bool(self.keys())

  def keys(self) -> ConfigModelKeysView:
-    return t.cast(
-        'ConfigModelKeysView', [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())
-    )
+    return t.cast('ConfigModelKeysView', [
+        self._load_attr_from_module(key, name)
+        for key, name in self._config_mapping.items()
+        if key in self._model_mapping.keys()
+    ] + list(self._extra_content.keys()))

  def values(self) -> ConfigModelValuesView:
-    return t.cast(
-        'ConfigModelValuesView', [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(
-            self._extra_content.values()
-        )
-    )
+    return t.cast('ConfigModelValuesView', [
+        self._load_attr_from_module(key, name)
+        for key, name in self._model_mapping.items()
+        if key in self._config_mapping.keys()
+    ] + list(self._extra_content.values()))

  def items(self) -> ConfigModelItemsView:
-    return t.cast(
-        'ConfigModelItemsView',
-        [(self._load_attr_from_module(key, self._config_mapping[key]),
-          self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items())
-    )
+    return t.cast('ConfigModelItemsView', [(self._load_attr_from_module(
+        key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
+                                           for key in self._model_mapping.keys()
+                                           if key in self._config_mapping.keys()] + list(self._extra_content.items()))

  def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]:
    return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys()))
@@ -172,7 +190,8 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):

  def register(self, key: t.Any, value: t.Any) -> None:
    if hasattr(key, '__name__') and key.__name__ in self._reverse_config_mapping:
-      if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM model.")
+      if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys():
+        raise ValueError(f"'{key}' is already used by a OpenLLM model.")
    self._extra_content[key] = value

 __all__ = ['BaseAutoLLMClass', '_LazyAutoMapping']
--- a/openllm-python/src/openllm/models/auto/modeling_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_auto.py
@@ -7,9 +7,10 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass
 from .factory import _LazyAutoMapping

-MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), (
-    'opt', 'OPT'
-), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
+MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'),
+                                   ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
+                                   ('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'),
+                                   ('baichuan', 'Baichuan')])
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)

 class AutoLLM(BaseAutoLLMClass):
--- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
@@ -7,9 +7,10 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass
 from .factory import _LazyAutoMapping

-MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), (
-    'opt', 'VLLMOPT'
-), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
+MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'),
+                                        ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
+                                        ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
+                                        ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
 MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)

 class AutoVLLM(BaseAutoLLMClass):
--- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
@@ -11,5 +11,6 @@ class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrai
    import torch
    inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
-      outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      outputs = self.model.generate(**inputs,
+                                    generation_config=self.config.model_construct_env(**attrs).to_generation_config())
      return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -14,7 +14,9 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
      self.model.eval()
      # Only use half precision if the model is not yet quantized
      if self.config.use_half_precision: self.model.half()
-      return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      return self.model.chat(self.tokenizer,
+                             prompt,
+                             generation_config=self.config.model_construct_env(**attrs).to_generation_config())

  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
    import torch
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -10,29 +10,57 @@ from openllm_core.config.configuration_dolly_v2 import END_KEY
 from openllm_core.config.configuration_dolly_v2 import RESPONSE_KEY
 from openllm_core.config.configuration_dolly_v2 import get_special_token_id
 if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
-else:  torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
+else:
+  torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader(
+      'transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
 logger = logging.getLogger(__name__)

@overload
-def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline:
+def get_pipeline(model: transformers.PreTrainedModel,
+                 tokenizer: transformers.PreTrainedTokenizer,
+                 _init: t.Literal[True] = True,
+                 **attrs: t.Any) -> transformers.Pipeline:
  ...

@overload
-def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]:
+def get_pipeline(model: transformers.PreTrainedModel,
+                 tokenizer: transformers.PreTrainedTokenizer,
+                 _init: t.Literal[False] = ...,
+                 **attrs: t.Any) -> type[transformers.Pipeline]:
  ...

-def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
+def get_pipeline(model: transformers.PreTrainedModel,
+                 tokenizer: transformers.PreTrainedTokenizer,
+                 _init: bool = False,
+                 **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
  # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
  class InstructionTextGenerationPipeline(transformers.Pipeline):
-    def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any):
-      super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)

-    def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
+    def __init__(self,
+                 *args: t.Any,
+                 do_sample: bool = True,
+                 max_new_tokens: int = 256,
+                 top_p: float = 0.92,
+                 top_k: int = 0,
+                 **kwargs: t.Any):
+      super().__init__(*args,
+                       model=model,
+                       tokenizer=tokenizer,
+                       do_sample=do_sample,
+                       max_new_tokens=max_new_tokens,
+                       top_p=top_p,
+                       top_k=top_k,
+                       **kwargs)
+
+    def _sanitize_parameters(self,
+                             return_full_text: bool | None = None,
+                             **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]:
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      preprocess_params: dict[str, t.Any] = {}
      # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
      # append a newline to yield a single token.  find whatever token is configured for the response key.
-      tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
+      tokenizer_response_key = next(
+          (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
      response_key_token_id = None
      end_key_token_id = None
      if tokenizer_response_key:
@@ -56,7 +84,8 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
      inputs['instruction_text'] = input_
      return t.cast(t.Dict[str, t.Any], inputs)

-    def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
+    def _forward(self, input_tensors: dict[str, t.Any],
+                 **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      input_ids, attention_mask = input_tensors['input_ids'], input_tensors.get('attention_mask', None)
      if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
@@ -65,15 +94,20 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
          input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
          attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
          pad_token_id=self.tokenizer.pad_token_id,
-          **generate_kwargs
-      )
+          **generate_kwargs)
      out_b = generated_sequence.shape[0]
-      if self.framework == 'pt': generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
-      elif self.framework == 'tf': generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
+      if self.framework == 'pt':
+        generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
+      elif self.framework == 'tf':
+        generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
      instruction_text = input_tensors.pop('instruction_text')
      return {'generated_sequence': generated_sequence, 'input_ids': input_ids, 'instruction_text': instruction_text}

-    def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
+    def postprocess(self,
+                    model_outputs: dict[str, t.Any],
+                    response_key_token_id: int,
+                    end_key_token_id: int,
+                    return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      _generated_sequence, instruction_text = model_outputs['generated_sequence'][0], model_outputs['instruction_text']
      generated_sequence: list[list[int]] = _generated_sequence.numpy().tolist()
@@ -89,7 +123,8 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
            response_pos = sequence.index(response_key_token_id)
          except ValueError:
            response_pos = None
-          if response_pos is None: logger.warning('Could not find response key %s in: %s', response_key_token_id, sequence)
+          if response_pos is None:
+            logger.warning('Could not find response key %s in: %s', response_key_token_id, sequence)
          if response_pos:
            # Next find where "### End" is located.  The model has been trained to end its responses with this
            # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
@@ -127,12 +162,20 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
-    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {}
+    return {
+        'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
+        'torch_dtype': torch.bfloat16
+    }, {}

  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
-    return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
+    return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
+                        self.tokenizer,
+                        _init=True,
+                        return_full_text=self.config.return_full_text)

  def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
    llm_config = self.config.model_construct_env(**attrs)
    with torch.inference_mode():
-      return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
+      return self.model(prompt,
+                        return_full_text=llm_config.return_full_text,
+                        generation_config=llm_config.to_generation_config())
--- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py
@@ -3,32 +3,43 @@ import typing as t

 import openllm
 if t.TYPE_CHECKING: import torch, transformers
-else: torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers')
+else:
+  torch, transformers = openllm.utils.LazyLoader('torch', globals(),
+                                                 'torch'), openllm.utils.LazyLoader('transformers', globals(),
+                                                                                    'transformers')

 class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
-    return {'torch_dtype': torch.bfloat16, 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
+    return {
+        'torch_dtype': torch.bfloat16,
+        'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None
+    }, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    eos_token_id, inputs = attrs.pop('eos_token_id', self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    eos_token_id, inputs = attrs.pop('eos_token_id',
+                                     self.tokenizer.eos_token_id), self.tokenizer(prompt,
+                                                                                  return_tensors='pt').to(self.device)
    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
-      return self.tokenizer.batch_decode(
-          self.model.generate(
-              input_ids=inputs['input_ids'],
-              attention_mask=inputs['attention_mask'],
-              generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()
-          ),
-          skip_special_tokens=True
-      )
+      return self.tokenizer.batch_decode(self.model.generate(
+          input_ids=inputs['input_ids'],
+          attention_mask=inputs['attention_mask'],
+          generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()),
+                                         skip_special_tokens=True)

-  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
-    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str],
+                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
+        prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
+        'stopping_criteria', openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
-    result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+    result = self.tokenizer.decode(
+        self.model.generate(encoded_inputs['input_ids'],
+                            max_new_tokens=max_new_tokens,
+                            stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -12,9 +12,10 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
    import torch
    with torch.inference_mode():
      return self.tokenizer.batch_decode(
-          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-          skip_special_tokens=True
-      )
+          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
+                              do_sample=True,
+                              generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+          skip_special_tokens=True)

  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
    import torch
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -9,18 +9,16 @@ if t.TYPE_CHECKING: import transformers
 class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
  __openllm_internal__ = True

-  def sanitize_parameters(
-      self,
-      prompt: str,
-      max_new_tokens: int | None = None,
-      temperature: float | None = None,
-      top_k: int | None = None,
-      top_p: float | None = None,
-      repetition_penalty: float | None = None,
-      decoder_start_token_id: int | None = None,
-      use_default_prompt_template: bool = True,
-      **attrs: t.Any
-  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+  def sanitize_parameters(self,
+                          prompt: str,
+                          max_new_tokens: int | None = None,
+                          temperature: float | None = None,
+                          top_k: int | None = None,
+                          top_p: float | None = None,
+                          repetition_penalty: float | None = None,
+                          decoder_start_token_id: int | None = None,
+                          use_default_prompt_template: bool = True,
+                          **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    if decoder_start_token_id is None: decoder_start_token_id = 0
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
        'max_new_tokens': max_new_tokens,
@@ -34,13 +32,10 @@ class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'tra
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
    decoder_start_token_id = attrs.pop('decoder_start_token_id', 0)
-    return self.tokenizer.batch_decode(
-        self.model.generate(
-            self.tokenizer(prompt, return_tensors='np')['input_ids'],
-            do_sample=True,
-            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-            decoder_start_token_id=decoder_start_token_id
-        ).sequences,
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=True
-    )
+    return self.tokenizer.batch_decode(self.model.generate(
+        self.tokenizer(prompt, return_tensors='np')['input_ids'],
+        do_sample=True,
+        generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+        decoder_start_token_id=decoder_start_token_id).sequences,
+                                       skip_special_tokens=True,
+                                       clean_up_tokenization_spaces=True)
--- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
@@ -8,7 +8,8 @@ class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transfo
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    return self.tokenizer.batch_decode(
-        self.model.generate(self.tokenizer(prompt, return_tensors='tf').input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-        skip_special_tokens=True
-    )
+    return self.tokenizer.batch_decode(self.model.generate(
+        self.tokenizer(prompt, return_tensors='tf').input_ids,
+        do_sample=True,
+        generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+                                       skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
+++ b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
@@ -25,11 +25,8 @@ class GPTNeoX(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNe
    import torch
    with torch.inference_mode():
      return self.tokenizer.batch_decode(
-          self.model.generate(
-              self.tokenizer(prompt, return_tensors='pt').to(self.device).input_ids,
-              do_sample=True,
-              generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-              pad_token_id=self.tokenizer.eos_token_id,
-              stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])
-          )
-      )
+          self.model.generate(self.tokenizer(prompt, return_tensors='pt').to(self.device).input_ids,
+                              do_sample=True,
+                              generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+                              pad_token_id=self.tokenizer.eos_token_id,
+                              stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])))
--- a/openllm-python/src/openllm/models/llama/modeling_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_llama.py
@@ -23,13 +23,20 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
      mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
      masked_embeddings = data * mask
      sum_embeddings, seq_length = torch.sum(masked_embeddings, dim=1), torch.sum(mask, dim=1)
-    return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), num_tokens=int(torch.sum(attention_mask).item()))
+    return openllm.LLMEmbeddings(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
+                                 num_tokens=int(torch.sum(attention_mask).item()))

-  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
-    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str],
+                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
+        prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
+        'stopping_criteria', openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
-    result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+    result = self.tokenizer.decode(
+        self.model.generate(encoded_inputs['input_ids'],
+                            max_new_tokens=max_new_tokens,
+                            stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
--- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
@@ -12,12 +12,15 @@ if t.TYPE_CHECKING:

 logger = logging.getLogger(__name__)

-def get_mpt_config(
-    model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True
-) -> transformers.PretrainedConfig:
+def get_mpt_config(model_id_or_path: str,
+                   max_sequence_length: int,
+                   device: torch.device | str | int | None,
+                   device_map: str | None = None,
+                   trust_remote_code: bool = True) -> transformers.PretrainedConfig:
  import torch
  config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
-  if hasattr(config, 'init_device') and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
+  if hasattr(config, 'init_device') and device_map is None and isinstance(device, (str, torch.device)):
+    config.init_device = str(device)
  if hasattr(config, 'attn_config') and is_triton_available(): config.attn_config['attn_impl'] = 'triton'
  else:
    logger.debug(
@@ -37,7 +40,10 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
-    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {}
+    return {
+        'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
+        'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    }, {}

  def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
    import torch
@@ -46,12 +52,24 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
    torch_dtype = attrs.pop('torch_dtype', self.dtype)
    device_map = attrs.pop('device_map', None)
    attrs.pop('low_cpu_mem_usage', None)
-    config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
+    config = get_mpt_config(self.model_id,
+                            self.config.max_sequence_length,
+                            self.device,
+                            device_map=device_map,
+                            trust_remote_code=trust_remote_code)
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
    if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
-    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
+    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
+                                                              config=config,
+                                                              torch_dtype=torch_dtype,
+                                                              trust_remote_code=trust_remote_code,
+                                                              device_map=device_map,
+                                                              **attrs)
    try:
-      return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
+      return bentoml.transformers.save_model(self.tag,
+                                             model,
+                                             custom_objects={'tokenizer': tokenizer},
+                                             labels=generate_labels(self))
    finally:
      torch.cuda.empty_cache()

@@ -60,10 +78,18 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
    torch_dtype = attrs.pop('torch_dtype', self.dtype)
    device_map = attrs.pop('device_map', None)
    trust_remote_code = attrs.pop('trust_remote_code', True)
-    config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
-    model = transformers.AutoModelForCausalLM.from_pretrained(
-        self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs
-    )
+    config = get_mpt_config(self._bentomodel.path,
+                            self.config.max_sequence_length,
+                            self.device,
+                            device_map=device_map,
+                            trust_remote_code=trust_remote_code,
+                           )
+    model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path,
+                                                              config=config,
+                                                              trust_remote_code=trust_remote_code,
+                                                              torch_dtype=torch_dtype,
+                                                              device_map=device_map,
+                                                              **attrs)
    model.tie_weights()
    return model

--- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
@@ -16,29 +16,35 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
  __openllm_internal__ = True

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
-    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+    config, tokenizer = transformers.AutoConfig.from_pretrained(
+        self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
    tokenizer.pad_token_id = config.pad_token_id
-    return bentoml.transformers.save_model(
-        self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self)
-    )
+    return bentoml.transformers.save_model(self.tag,
+                                           transformers.FlaxAutoModelForCausalLM.from_pretrained(
+                                               self.model_id, **attrs),
+                                           custom_objects={'tokenizer': tokenizer},
+                                           labels=generate_labels(self))

-  def sanitize_parameters(
-      self,
-      prompt: str,
-      max_new_tokens: int | None = None,
-      temperature: float | None = None,
-      top_k: int | None = None,
-      num_return_sequences: int | None = None,
-      repetition_penalty: float | None = None,
-      use_default_prompt_template: bool = False,
-      **attrs: t.Any
-  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+  def sanitize_parameters(self,
+                          prompt: str,
+                          max_new_tokens: int | None = None,
+                          temperature: float | None = None,
+                          top_k: int | None = None,
+                          num_return_sequences: int | None = None,
+                          repetition_penalty: float | None = None,
+                          use_default_prompt_template: bool = False,
+                          **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
-        'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'repetition_penalty': repetition_penalty
+        'max_new_tokens': max_new_tokens,
+        'temperature': temperature,
+        'top_k': top_k,
+        'num_return_sequences': num_return_sequences,
+        'repetition_penalty': repetition_penalty
    }, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    return self.tokenizer.batch_decode(
-        self.model.generate(**self.tokenizer(prompt, return_tensors='np'), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
-        skip_special_tokens=True
-    )
+    return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='np'),
+                                                           do_sample=True,
+                                                           generation_config=self.config.model_construct_env(
+                                                               **attrs).to_generation_config()).sequences,
+                                       skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_opt.py
@@ -19,6 +19,7 @@ class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer
    import torch
    with torch.inference_mode():
      return self.tokenizer.batch_decode(
-          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-          skip_special_tokens=True
-      )
+          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
+                              do_sample=True,
+                              generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+          skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
@@ -11,17 +11,18 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
    import transformers
-    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+    config, tokenizer = transformers.AutoConfig.from_pretrained(
+        self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
    tokenizer.pad_token_id = config.pad_token_id
-    return bentoml.transformers.save_model(
-        self.tag,
-        transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),
-        custom_objects={'tokenizer': tokenizer},
-        labels=generate_labels(self)
-    )
+    return bentoml.transformers.save_model(self.tag,
+                                           transformers.TFOPTForCausalLM.from_pretrained(
+                                               self.model_id, trust_remote_code=trust_remote_code, **attrs),
+                                           custom_objects={'tokenizer': tokenizer},
+                                           labels=generate_labels(self))

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    return self.tokenizer.batch_decode(
-        self.model.generate(**self.tokenizer(prompt, return_tensors='tf'), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-        skip_special_tokens=True
-    )
+        self.model.generate(**self.tokenizer(prompt, return_tensors='tf'),
+                            do_sample=True,
+                            generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+        skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
@@ -10,16 +10,17 @@ class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']):
  __openllm_internal__ = True
  tokenizer_id = 'local'

-  def sanitize_parameters(
-      self,
-      prompt: str,
-      max_new_tokens: int | None = None,
-      temperature: float | None = None,
-      top_k: int | None = None,
-      num_return_sequences: int | None = None,
-      use_default_prompt_template: bool = True,
-      **attrs: t.Any
-  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+  def sanitize_parameters(self,
+                          prompt: str,
+                          max_new_tokens: int | None = None,
+                          temperature: float | None = None,
+                          top_k: int | None = None,
+                          num_return_sequences: int | None = None,
+                          use_default_prompt_template: bool = True,
+                          **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
-        'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences
+        'max_new_tokens': max_new_tokens,
+        'temperature': temperature,
+        'top_k': top_k,
+        'num_return_sequences': num_return_sequences
    }, {}
--- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
@@ -22,13 +22,10 @@ class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTN
    with torch.inference_mode():
      return [
          self.tokenizer.decode(
-              self.model.generate(
-                  **self.tokenizer(prompt, return_tensors='pt').to(self.device),
-                  do_sample=True,
-                  generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-                  pad_token_id=self.tokenizer.eos_token_id,
-                  stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])
-              )[0],
-              skip_special_tokens=True
-          )
+              self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
+                                  do_sample=True,
+                                  generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+                                  pad_token_id=self.tokenizer.eos_token_id,
+                                  stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0],
+              skip_special_tokens=True)
      ]
--- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
@@ -18,17 +18,29 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
-    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+    return {
+        'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
+        'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32
+    }, {}

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
    import torch
    import transformers
    torch_dtype, device_map = attrs.pop('torch_dtype', torch.float16), attrs.pop('device_map', 'auto')
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
-    tokenizer.add_special_tokens({'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], 'pad_token': EOD})
-    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
+    tokenizer.add_special_tokens({
+        'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
+        'pad_token': EOD
+    })
+    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
+                                                              torch_dtype=torch_dtype,
+                                                              device_map=device_map,
+                                                              **attrs)
    try:
-      return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
+      return bentoml.transformers.save_model(self.tag,
+                                             model,
+                                             custom_objects={'tokenizer': tokenizer},
+                                             labels=generate_labels(self))
    finally:
      torch.cuda.empty_cache()

@@ -41,17 +53,22 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
          self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
          do_sample=True,
          pad_token_id=self.tokenizer.eos_token_id,
-          generation_config=self.config.model_construct_env(**attrs).to_generation_config()
-      )
+          generation_config=self.config.model_construct_env(**attrs).to_generation_config())
      # TODO: We will probably want to return the tokenizer here so that we can manually process this
      # return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
      return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

-  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
-    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str],
+                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
+        prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
+        'stopping_criteria', openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
-    result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+    result = self.tokenizer.decode(
+        self.model.generate(encoded_inputs['input_ids'],
+                            max_new_tokens=max_new_tokens,
+                            stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
--- a/openllm-python/src/openllm/playground/falcon_tuned.py
+++ b/openllm-python/src/openllm/playground/falcon_tuned.py
@@ -56,20 +56,34 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
 else:
  model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())

-model, tokenizer = openllm.AutoLLM.for_model("falcon", model_id=model_args.model_id, quantize="int4", bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, ensure_available=True).prepare_for_training(adapter_type="lora", lora_alpha=16, lora_dropout=0.1, r=16, bias="none", target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"])
+model, tokenizer = openllm.AutoLLM.for_model("falcon",
+                                             model_id=model_args.model_id,
+                                             quantize="int4",
+                                             bnb_4bit_quant_type="nf4",
+                                             bnb_4bit_compute_dtype=torch.float16,
+                                             ensure_available=True).prepare_for_training(adapter_type="lora",
+                                                                                         lora_alpha=16,
+                                                                                         lora_dropout=0.1,
+                                                                                         r=16,
+                                                                                         bias="none",
+                                                                                         target_modules=[
+                                                                                             "query_key_value", "dense",
+                                                                                             "dense_h_to_4h",
+                                                                                             "dense_4h_to_h"
+                                                                                         ])
 model.config.use_cache = False
 tokenizer.pad_token = tokenizer.eos_token

 dataset = load_dataset(DATASET_NAME, split="train")

-trainer = SFTTrainer(
-    model=model,
-    train_dataset=dataset,
-    dataset_text_field="text",
-    max_seq_length=model_args.max_sequence_length,
-    tokenizer=tokenizer,
-    args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
-)
+trainer = SFTTrainer(model=model,
+                     train_dataset=dataset,
+                     dataset_text_field="text",
+                     max_seq_length=model_args.max_sequence_length,
+                     tokenizer=tokenizer,
+                     args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
+                                              **dataclasses.asdict(training_args)),
+                    )

 # upcast layernorm in float32 for more stable training
 for name, module in trainer.model.named_modules():
--- a/openllm-python/src/openllm/playground/llama2_qlora.py
+++ b/openllm-python/src/openllm/playground/llama2_qlora.py
@@ -75,10 +75,13 @@ def chunk(sample, chunk_length=2048):

  # get max number of chunks for batch
  if batch_total_length >= chunk_length:
-    batch_chunk_length = (batch_total_length//chunk_length) * chunk_length
+    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

  # Split by chunks of max_len.
-  result = {k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)] for k, t in concatenated_examples.items()}
+  result = {
+      k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
+      for k, t in concatenated_examples.items()
+  }
  # add remainder to global variable for next batch
  remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
  # prepare labels
@@ -98,33 +101,39 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
  print("Sample from dolly-v2 ds:", dataset[randint(0, len(dataset))]["text"])

  # tokenize and chunk dataset
-  lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True,)
+  lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]),
+                           batched=True,
+                           remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True)

  # Print total number of samples
  print(f"Total number of samples: {len(lm_dataset)}")
  return lm_dataset

-def prepare_for_int4_training(model_id: str, model_version: str | None = None, gradient_checkpointing: bool = True, bf16: bool = True,
-                              ) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
+def prepare_for_int4_training(model_id: str,
+                              model_version: str | None = None,
+                              gradient_checkpointing: bool = True,
+                              bf16: bool = True,
+                             ) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
  from peft.tuners.lora import LoraLayer

-  llm = openllm.AutoLLM.for_model(
-      "llama",
-      model_id=model_id,
-      model_version=model_version,
-      ensure_available=True,
-      quantize="int4",
-      bnb_4bit_compute_dtype=torch.bfloat16,
-      use_cache=not gradient_checkpointing,
-      device_map="auto",
-  )
+  llm = openllm.AutoLLM.for_model("llama",
+                                  model_id=model_id,
+                                  model_version=model_version,
+                                  ensure_available=True,
+                                  quantize="int4",
+                                  bnb_4bit_compute_dtype=torch.bfloat16,
+                                  use_cache=not gradient_checkpointing,
+                                  device_map="auto",
+                                 )
  print("Model summary:", llm.model)

  # get lora target modules
  modules = find_all_linear_names(llm.model)
  print(f"Found {len(modules)} modules to quantize: {modules}")

-  model, tokenizer = llm.prepare_for_training(adapter_type="lora", use_gradient_checkpointing=gradient_checkpointing, target_modules=modules)
+  model, tokenizer = llm.prepare_for_training(adapter_type="lora",
+                                              use_gradient_checkpointing=gradient_checkpointing,
+                                              target_modules=modules)

  # pre-process the model by upcasting the layer norms in float 32 for
  for name, module in model.named_modules():
@@ -177,15 +186,18 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):

  transformers.set_seed(model_args.seed)

-  model, tokenizer = prepare_for_int4_training(model_args.model_id, gradient_checkpointing=training_args.gradient_checkpointing, bf16=training_args.bf16,)
+  model, tokenizer = prepare_for_int4_training(model_args.model_id,
+                                               gradient_checkpointing=training_args.gradient_checkpointing,
+                                               bf16=training_args.bf16,
+                                              )
  datasets = prepare_datasets(tokenizer)

-  trainer = transformers.Trainer(
-      model=model,
-      args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
-      train_dataset=datasets,
-      data_collator=transformers.default_data_collator,
-  )
+  trainer = transformers.Trainer(model=model,
+                                 args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
+                                                          **dataclasses.asdict(training_args)),
+                                 train_dataset=datasets,
+                                 data_collator=transformers.default_data_collator,
+                                )

  trainer.train()

@@ -200,10 +212,14 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
    del model, trainer
    torch.cuda.empty_cache()

-    model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
+    model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir,
+                                                          low_cpu_mem_usage=True,
+                                                          torch_dtype=torch.float16)
    # merge lora with base weights and save
    model = model.merge_and_unload()
-    model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"), safe_serialization=True, max_shard_size="2GB")
+    model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"),
+                          safe_serialization=True,
+                          max_shard_size="2GB")
  else:
    trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora"))

--- a/openllm-python/src/openllm/playground/opt_tuned.py
+++ b/openllm-python/src/openllm/playground/opt_tuned.py
@@ -26,13 +26,14 @@ if t.TYPE_CHECKING:

 DEFAULT_MODEL_ID = "facebook/opt-6.7b"

-def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments):
-  return transformers.Trainer(
-      model=model,
-      train_dataset=dataset_dict["train"],
-      args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)),
-      data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
-  )
+def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any,
+                 training_args: TrainingArguments):
+  return transformers.Trainer(model=model,
+                              train_dataset=dataset_dict["train"],
+                              args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir),
+                                                       **dataclasses.asdict(training_args)),
+                              data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+                             )

@dataclasses.dataclass
 class TrainingArguments:
@@ -57,7 +58,16 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
 else:
  model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())

-model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8", ensure_available=True).prepare_for_training(adapter_type="lora", r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
+model, tokenizer = openllm.AutoLLM.for_model("opt",
+                                             model_id=model_args.model_id,
+                                             quantize="int8",
+                                             ensure_available=True).prepare_for_training(
+                                                 adapter_type="lora",
+                                                 r=16,
+                                                 lora_alpha=32,
+                                                 target_modules=["q_proj", "v_proj"],
+                                                 lora_dropout=0.05,
+                                                 bias="none")

 # ft on english_quotes
 data = load_dataset("Abirate/english_quotes")
--- a/openllm-python/src/openllm/serialisation/init.py
+++ b/openllm-python/src/openllm/serialisation/init.py
@@ -64,10 +64,11 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
        raise openllm.exceptions.OpenLLMException(
            "Bento model does not have tokenizer. Make sure to save"
            " the tokenizer within the model via 'custom_objects'."
-            " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\""
-        ) from None
+            " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None
  else:
-    tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'), trust_remote_code=llm.__llm_trust_remote_code__, **tokenizer_attrs)
+    tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath('/'),
+                                                               trust_remote_code=llm.__llm_trust_remote_code__,
+                                                               **tokenizer_attrs)

  if tokenizer.pad_token_id is None:
    if config.pad_token_id is not None: tokenizer.pad_token_id = config.pad_token_id
@@ -77,12 +78,14 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
  return tokenizer

 class _Caller(t.Protocol[P]):
+
  def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
    ...

 _extras = ['get', 'import_model', 'save_pretrained', 'load_model']

 def _make_dispatch_function(fn: str) -> _Caller[P]:
+
  def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any:
    """Generic function dispatch to correct serialisation submodules based on LLM runtime.

--- a/openllm-python/src/openllm/serialisation/constants.py
+++ b/openllm-python/src/openllm/serialisation/constants.py
@@ -6,4 +6,7 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = {
    'flax': ('FlaxAutoModelForCausalLM', 'FlaxAutoModelForSeq2SeqLM'),
    'vllm': ('AutoModelForCausalLM', 'AutoModelForSeq2SeqLM')
 }
-HUB_ATTRS = ['cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token']
+HUB_ATTRS = [
+    'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision',
+    'subfolder', 'use_auth_token'
+]
--- a/openllm-python/src/openllm/serialisation/ggml.py
+++ b/openllm-python/src/openllm/serialisation/ggml.py
@@ -13,7 +13,11 @@ if t.TYPE_CHECKING:

 _conversion_strategy = {'pt': 'ggml'}

-def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model:
+def import_model(llm: openllm.LLM[t.Any, t.Any],
+                 *decls: t.Any,
+                 trust_remote_code: bool = True,
+                 **attrs: t.Any,
+                ) -> bentoml.Model:
  raise NotImplementedError('Currently work in progress.')

 def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
@@ -27,9 +31,12 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo
  try:
    model = bentoml.models.get(llm.tag)
    if model.info.module not in ('openllm.serialisation.ggml', __name__):
-      raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.")
+      raise bentoml.exceptions.NotFound(
+          f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
+      )
    if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
-      raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
+      raise openllm.exceptions.OpenLLMException(
+          f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
    return model
  except bentoml.exceptions.NotFound:
    if auto_import:
--- a/openllm-python/src/openllm/serialisation/transformers/init.py
+++ b/openllm-python/src/openllm/serialisation/transformers/init.py
@@ -46,7 +46,11 @@ logger = logging.getLogger(__name__)
 __all__ = ['import_model', 'get', 'load_model', 'save_pretrained']

@inject
-def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, _model_store: ModelStore = Provide[BentoMLContainer.model_store], **attrs: t.Any) -> bentoml.Model:
+def import_model(llm: openllm.LLM[M, T],
+                 *decls: t.Any,
+                 trust_remote_code: bool,
+                 _model_store: ModelStore = Provide[BentoMLContainer.model_store],
+                 **attrs: t.Any) -> bentoml.Model:
  """Auto detect model type from given model_id and import it to bentoml's model store.

  For all kwargs, it will be parsed into `transformers.AutoConfig.from_pretrained` first,
@@ -67,80 +71,110 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
  config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
  _, tokenizer_attrs = llm.llm_parameters
  quantize_method = llm._quantize_method
-  safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors')
+  safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'),
+                                                    default=llm._serialisation_format == 'safetensors')
  # Disable safe serialization with vLLM
  if llm.__llm_implementation__ == 'vllm': safe_serialisation = False
-  metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method}
+  metadata: DictStrAny = {
+      'safe_serialisation': safe_serialisation,
+      '_quantize': quantize_method is not None and quantize_method
+  }
  signatures: DictStrAny = {}

  if quantize_method == 'gptq':
    if not openllm.utils.is_autogptq_available():
-      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
-    if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
+      raise openllm.exceptions.OpenLLMException(
+          "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
+      )
+    if llm.config['model_type'] != 'causal_lm':
+      raise openllm.exceptions.OpenLLMException(
+          f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
    signatures['generate'] = {'batchable': False}
  else:
    # this model might be called with --quantize int4, therefore we need to pop this out
    # since saving int4 is not yet supported
-    if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False): attrs.pop('quantization_config')
+    if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False):
+      attrs.pop('quantization_config')
    if llm.__llm_implementation__ != 'flax': attrs['use_safetensors'] = safe_serialisation
    metadata['_framework'] = 'pt' if llm.__llm_implementation__ == 'vllm' else llm.__llm_implementation__

-  tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
+  tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id,
+                                                             trust_remote_code=trust_remote_code,
+                                                             **hub_attrs,
+                                                             **tokenizer_attrs)
  if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token

  external_modules: list[types.ModuleType] = [importlib.import_module(tokenizer.__module__)]
  imported_modules: list[types.ModuleType] = []
-  bentomodel = bentoml.Model.create(
-      llm.tag,
-      module='openllm.serialisation.transformers',
-      api_version='v1',
-      options=ModelOptions(),
-      context=openllm.utils.generate_context(framework_name='openllm'),
-      labels=openllm.utils.generate_labels(llm),
-      signatures=signatures if signatures else make_model_signatures(llm)
-  )
+  bentomodel = bentoml.Model.create(llm.tag,
+                                    module='openllm.serialisation.transformers',
+                                    api_version='v1',
+                                    options=ModelOptions(),
+                                    context=openllm.utils.generate_context(framework_name='openllm'),
+                                    labels=openllm.utils.generate_labels(llm),
+                                    signatures=signatures if signatures else make_model_signatures(llm))
  with openllm.utils.analytics.set_bentoml_tracking():
    try:
      bentomodel.enter_cloudpickle_context(external_modules, imported_modules)
      tokenizer.save_pretrained(bentomodel.path)
      if quantize_method == 'gptq':
        if not openllm.utils.is_autogptq_available():
-          raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
-        if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
+          raise openllm.exceptions.OpenLLMException(
+              "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
+          )
+        if llm.config['model_type'] != 'causal_lm':
+          raise openllm.exceptions.OpenLLMException(
+              f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
        logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
-        model = autogptq.AutoGPTQForCausalLM.from_quantized(
-            llm.model_id,
-            *decls,
-            quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
-            trust_remote_code=trust_remote_code,
-            use_safetensors=safe_serialisation,
-            **hub_attrs,
-            **attrs,
-        )
-        update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework})
+        model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id,
+                                                            *decls,
+                                                            quantize_config=t.cast('autogptq.BaseQuantizeConfig',
+                                                                                   llm.quantization_config),
+                                                            trust_remote_code=trust_remote_code,
+                                                            use_safetensors=safe_serialisation,
+                                                            **hub_attrs,
+                                                            **attrs,
+                                                           )
+        update_model(bentomodel,
+                     metadata={
+                         '_pretrained_class': model.__class__.__name__,
+                         '_framework': model.model.framework
+                     })
        model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation)
      else:
        architectures = getattr(config, 'architectures', [])
        if not architectures:
-          raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
+          raise RuntimeError(
+              'Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`'
+          )
        architecture = architectures[0]
        update_model(bentomodel, metadata={'_pretrained_class': architecture})
        if llm._local:
          # possible local path
          logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
-          model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
+          model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id,
+                                                                        *decls,
+                                                                        config=config,
+                                                                        trust_remote_code=trust_remote_code,
+                                                                        **hub_attrs,
+                                                                        **attrs)
          # for trust_remote_code to work
          bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
          model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
        else:
          # we will clone the all tings into the bentomodel path without loading model into memory
-          snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
+          snapshot_download(llm.model_id,
+                            local_dir=bentomodel.path,
+                            local_dir_use_symlinks=False,
+                            ignore_patterns=HfIgnore.ignore_patterns(llm))
    except Exception:
      raise
    else:
      bentomodel.flush()  # type: ignore[no-untyped-call]
      bentomodel.save(_model_store)
-      openllm.utils.analytics.track(openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
+      openllm.utils.analytics.track(
+          openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module,
+                                                 model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024))
    finally:
      bentomodel.exit_cloudpickle_context(imported_modules)
      # NOTE: We need to free up the cache after importing the model
@@ -158,13 +192,15 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
  '''
  try:
    model = bentoml.models.get(llm.tag)
-    if model.info.module not in (
-        'openllm.serialisation.transformers'
-        'bentoml.transformers', 'bentoml._internal.frameworks.transformers', __name__
-    ):  # NOTE: backward compatible with previous version of OpenLLM.
-      raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.")
+    if model.info.module not in ('openllm.serialisation.transformers'
+                                 'bentoml.transformers', 'bentoml._internal.frameworks.transformers',
+                                 __name__):  # NOTE: backward compatible with previous version of OpenLLM.
+      raise bentoml.exceptions.NotFound(
+          f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'."
+      )
    if 'runtime' in model.info.labels and model.info.labels['runtime'] != llm.runtime:
-      raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
+      raise openllm.exceptions.OpenLLMException(
+          f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")
    return model
  except bentoml.exceptions.NotFound as err:
    if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__)
@@ -177,44 +213,50 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
  If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
  '''
  config, hub_attrs, attrs = process_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs)
-  safe_serialization = openllm.utils.first_not_none(
-      t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)), attrs.pop('safe_serialization', None), default=llm._serialisation_format == 'safetensors'
-  )
+  safe_serialization = openllm.utils.first_not_none(t.cast(
+      t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
+                                                    attrs.pop('safe_serialization', None),
+                                                    default=llm._serialisation_format == 'safetensors')
  if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
    if not openllm.utils.is_autogptq_available():
-      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
-    if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-    return autogptq.AutoGPTQForCausalLM.from_quantized(
-        llm._bentomodel.path,
-        *decls,
-        quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
-        trust_remote_code=llm.__llm_trust_remote_code__,
-        use_safetensors=safe_serialization,
-        **hub_attrs,
-        **attrs
-    )
+      raise openllm.exceptions.OpenLLMException(
+          "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
+      )
+    if llm.config['model_type'] != 'causal_lm':
+      raise openllm.exceptions.OpenLLMException(
+          f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
+    return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,
+                                                       *decls,
+                                                       quantize_config=t.cast('autogptq.BaseQuantizeConfig',
+                                                                              llm.quantization_config),
+                                                       trust_remote_code=llm.__llm_trust_remote_code__,
+                                                       use_safetensors=safe_serialization,
+                                                       **hub_attrs,
+                                                       **attrs)

  device_map = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
-  model = infer_autoclass_from_llm(llm, config).from_pretrained(
-      llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.__llm_trust_remote_code__, device_map=device_map, **hub_attrs, **attrs
-  ).eval()
+  model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path,
+                                                                *decls,
+                                                                config=config,
+                                                                trust_remote_code=llm.__llm_trust_remote_code__,
+                                                                device_map=device_map,
+                                                                **hub_attrs,
+                                                                **attrs).eval()
  # BetterTransformer is currently only supported on PyTorch.
  if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
  if llm.__llm_implementation__ in {'pt', 'vllm'}: check_unintialised_params(model)
  return t.cast('M', model)

-def save_pretrained(
-    llm: openllm.LLM[M, T],
-    save_directory: str,
-    is_main_process: bool = True,
-    state_dict: DictStrAny | None = None,
-    save_function: t.Any | None = None,
-    push_to_hub: bool = False,
-    max_shard_size: int | str = '10GB',
-    safe_serialization: bool = False,
-    variant: str | None = None,
-    **attrs: t.Any
-) -> None:
+def save_pretrained(llm: openllm.LLM[M, T],
+                    save_directory: str,
+                    is_main_process: bool = True,
+                    state_dict: DictStrAny | None = None,
+                    save_function: t.Any | None = None,
+                    push_to_hub: bool = False,
+                    max_shard_size: int | str = '10GB',
+                    safe_serialization: bool = False,
+                    variant: str | None = None,
+                    **attrs: t.Any) -> None:
  save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save))
  model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs)
  safe_serialization = safe_serialization or llm._serialisation_format == 'safetensors'
@@ -222,25 +264,31 @@ def save_pretrained(
  if llm.__llm_implementation__ == 'vllm': safe_serialization = False
  if llm._quantize_method == 'gptq':
    if not openllm.utils.is_autogptq_available():
-      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
-    if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
-    if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM): raise ValueError(f'Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})')
-    t.cast('autogptq.modeling.BaseGPTQForCausalLM', llm.model).save_quantized(save_directory, use_safetensors=safe_serialization)
+      raise openllm.exceptions.OpenLLMException(
+          "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
+      )
+    if llm.config['model_type'] != 'causal_lm':
+      raise openllm.exceptions.OpenLLMException(
+          f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
+    if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM):
+      raise ValueError(f'Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})')
+    t.cast('autogptq.modeling.BaseGPTQForCausalLM', llm.model).save_quantized(save_directory,
+                                                                              use_safetensors=safe_serialization)
  elif openllm.utils.LazyType['vllm.LLMEngine']('vllm.LLMEngine').isinstance(llm.model):
-    raise RuntimeError("vllm.LLMEngine cannot be serialisation directly. This happens when 'save_pretrained' is called directly after `openllm.AutoVLLM` is initialized.")
+    raise RuntimeError(
+        "vllm.LLMEngine cannot be serialisation directly. This happens when 'save_pretrained' is called directly after `openllm.AutoVLLM` is initialized."
+    )
  elif isinstance(llm.model, transformers.Pipeline):
    llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
  else:
    # We can safely cast here since it will be the PreTrainedModel protocol.
-    t.cast('transformers.PreTrainedModel', llm.model).save_pretrained(
-        save_directory,
-        is_main_process=is_main_process,
-        state_dict=state_dict,
-        save_function=save_function,
-        push_to_hub=push_to_hub,
-        max_shard_size=max_shard_size,
-        safe_serialization=safe_serialization,
-        variant=variant,
-        **model_save_attrs
-    )
+    t.cast('transformers.PreTrainedModel', llm.model).save_pretrained(save_directory,
+                                                                      is_main_process=is_main_process,
+                                                                      state_dict=state_dict,
+                                                                      save_function=save_function,
+                                                                      push_to_hub=push_to_hub,
+                                                                      max_shard_size=max_shard_size,
+                                                                      safe_serialization=safe_serialization,
+                                                                      variant=variant,
+                                                                      **model_save_attrs)
  llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs)
--- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py
+++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py
@@ -23,11 +23,14 @@ if t.TYPE_CHECKING:
  from openllm_core._typing_compat import M
  from openllm_core._typing_compat import T
 else:
-  transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
+  transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(),
+                                                      'transformers'), openllm_core.utils.LazyLoader(
+                                                          'torch', globals(), 'torch')

 _object_setattr = object.__setattr__

-def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
+def process_config(model_id: str, trust_remote_code: bool,
+                   **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
  '''A helper function that correctly parse config and attributes for transformers.PretrainedConfig.

  Args:
@@ -44,19 +47,27 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu
  if not isinstance(config, transformers.PretrainedConfig):
    copied_attrs = copy.deepcopy(attrs)
    if copied_attrs.get('torch_dtype', None) == 'auto': copied_attrs.pop('torch_dtype')
-    config, attrs = transformers.AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs)
+    config, attrs = transformers.AutoConfig.from_pretrained(model_id,
+                                                            return_unused_kwargs=True,
+                                                            trust_remote_code=trust_remote_code,
+                                                            **hub_attrs,
+                                                            **copied_attrs)
  return config, hub_attrs, attrs

 def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T:
-  __cls = getattr(transformers, openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None)
-  if __cls is None: raise ValueError(f'Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`')
+  __cls = getattr(transformers,
+                  openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None)
+  if __cls is None:
+    raise ValueError(f'Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`')
  return __cls

 def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.PretrainedConfig, /) -> _BaseAutoModelClass:
  if llm.config['trust_remote_code']:
    autoclass = 'AutoModelForSeq2SeqLM' if llm.config['model_type'] == 'seq2seq_lm' else 'AutoModelForCausalLM'
    if not hasattr(config, 'auto_map'):
-      raise ValueError(f'Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping')
+      raise ValueError(
+          f'Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping'
+      )
    # in case this model doesn't use the correct auto class for model type, for example like chatglm
    # where it uses AutoModel instead of AutoModelForCausalLM. Then we fallback to AutoModel
    if autoclass not in config.auto_map: autoclass = 'AutoModel'
@@ -69,14 +80,24 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra

 def check_unintialised_params(model: torch.nn.Module) -> None:
  unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device('meta')]
-  if len(unintialized) > 0: raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}')
+  if len(unintialized) > 0:
+    raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}')

 def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model:
  based: DictStrAny = copy.deepcopy(bentomodel.info.metadata)
  based.update(metadata)
-  _object_setattr(bentomodel, '_info', ModelInfo(  # type: ignore[call-arg] # XXX: remove me once upstream is merged
-      tag=bentomodel.info.tag, module=bentomodel.info.module, labels=bentomodel.info.labels, options=bentomodel.info.options.to_dict(), signatures=bentomodel.info.signatures, context=bentomodel.info.context, api_version=bentomodel.info.api_version, creation_time=bentomodel.info.creation_time, metadata=based
-  ))
+  _object_setattr(
+      bentomodel, '_info',
+      ModelInfo(  # type: ignore[call-arg] # XXX: remove me once upstream is merged
+          tag=bentomodel.info.tag,
+          module=bentomodel.info.module,
+          labels=bentomodel.info.labels,
+          options=bentomodel.info.options.to_dict(),
+          signatures=bentomodel.info.signatures,
+          context=bentomodel.info.context,
+          api_version=bentomodel.info.api_version,
+          creation_time=bentomodel.info.creation_time,
+          metadata=based))
  return bentomodel

 # NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures
@@ -84,9 +105,13 @@ def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
  infer_fn: tuple[str, ...] = ('__call__',)
  default_config = ModelSignature(batchable=False)
  if llm.__llm_implementation__ in {'pt', 'vllm'}:
-    infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', 'constrained_beam_search',)
+    infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample',
+                 'group_beam_search', 'constrained_beam_search',
+                )
  elif llm.__llm_implementation__ == 'tf':
-    infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search',)
+    infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search',
+                 'contrastive_search',
+                )
  else:
    infer_fn += ('generate',)
  return {k: default_config for k in infer_fn}
--- a/openllm-python/src/openllm/serialisation/transformers/weights.py
+++ b/openllm-python/src/openllm/serialisation/transformers/weights.py
@@ -25,7 +25,8 @@ class HfIgnore:
  def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]:
    if llm.__llm_implementation__ == 'vllm': base = [cls.tf, cls.flax, cls.safetensors]
    elif llm.__llm_implementation__ == 'tf': base = [cls.flax, cls.pt]
-    elif llm.__llm_implementation__ == 'flax': base = [cls.tf, cls.pt, cls.safetensors]  # as of current, safetensors is not supported with flax
+    elif llm.__llm_implementation__ == 'flax':
+      base = [cls.tf, cls.pt, cls.safetensors]  # as of current, safetensors is not supported with flax
    else:
      base = [cls.tf, cls.flax]
      if has_safetensors_weights(llm.model_id): base.append(cls.pt)
--- a/openllm-python/src/openllm/testing.py
+++ b/openllm-python/src/openllm/testing.py
@@ -15,9 +15,11 @@ if t.TYPE_CHECKING:
 logger = logging.getLogger(__name__)

@contextlib.contextmanager
-def build_bento(
-    model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, runtime: t.Literal['ggml', 'transformers'] = 'transformers', cleanup: bool = False
-) -> t.Iterator[bentoml.Bento]:
+def build_bento(model: str,
+                model_id: str | None = None,
+                quantize: t.Literal['int4', 'int8', 'gptq'] | None = None,
+                runtime: t.Literal['ggml', 'transformers'] = 'transformers',
+                cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
  logger.info('Building BentoML for %s', model)
  bento = openllm.build(model, model_id=model_id, quantize=quantize, runtime=runtime)
  yield bento
@@ -26,7 +28,10 @@ def build_bento(
    bentoml.bentos.delete(bento.tag)

@contextlib.contextmanager
-def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any) -> t.Iterator[str]:
+def build_container(bento: bentoml.Bento | str | bentoml.Tag,
+                    image_tag: str | None = None,
+                    cleanup: bool = False,
+                    **attrs: t.Any) -> t.Iterator[str]:
  if isinstance(bento, bentoml.Bento): bento_tag = bento.tag
  else: bento_tag = bentoml.Tag.from_taglike(bento)
  if image_tag is None: image_tag = str(bento_tag)
@@ -42,22 +47,23 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | N
      subprocess.check_output([executable, 'rmi', '-f', image_tag])

@contextlib.contextmanager
-def prepare(
-    model: str,
-    model_id: str | None = None,
-    implementation: LiteralRuntime = 'pt',
-    deployment_mode: t.Literal['container', 'local'] = 'local',
-    clean_context: contextlib.ExitStack | None = None,
-    cleanup: bool = True
-) -> t.Iterator[str]:
+def prepare(model: str,
+            model_id: str | None = None,
+            implementation: LiteralRuntime = 'pt',
+            deployment_mode: t.Literal['container', 'local'] = 'local',
+            clean_context: contextlib.ExitStack | None = None,
+            cleanup: bool = True) -> t.Iterator[str]:
  if clean_context is None:
    clean_context = contextlib.ExitStack()
    cleanup = True
  llm = openllm.infer_auto_class(implementation).for_model(model, model_id=model_id, ensure_available=True)
  bento_tag = bentoml.Tag.from_taglike(f'{llm.llm_type}-service:{llm.tag.version}')
-  if not bentoml.list(bento_tag): bento = clean_context.enter_context(build_bento(model, model_id=model_id, cleanup=cleanup))
-  else: bento = bentoml.get(bento_tag)
+  if not bentoml.list(bento_tag):
+    bento = clean_context.enter_context(build_bento(model, model_id=model_id, cleanup=cleanup))
+  else:
+    bento = bentoml.get(bento_tag)
  container_name = f'openllm-{model}-{llm.llm_type}'.replace('-', '_')
-  if deployment_mode == 'container': container_name = clean_context.enter_context(build_container(bento, image_tag=container_name, cleanup=cleanup))
+  if deployment_mode == 'container':
+    container_name = clean_context.enter_context(build_container(bento, image_tag=container_name, cleanup=cleanup))
  yield container_name
  if cleanup: clean_context.close()
--- a/openllm-python/src/openllm/utils/init.py
+++ b/openllm-python/src/openllm/utils/init.py
@@ -19,9 +19,17 @@ if t.TYPE_CHECKING:
  from openllm_core._typing_compat import LiteralRuntime

 def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
-  return {'runtime': llm.runtime, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation_format': llm._serialisation_format}
+  return {
+      'runtime': llm.runtime,
+      'framework': 'openllm',
+      'model_name': llm.config['model_name'],
+      'architecture': llm.config['architecture'],
+      'serialisation_format': llm._serialisation_format
+  }

-def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
+def infer_auto_class(
+    implementation: LiteralRuntime
+) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
  import openllm
  if implementation == 'tf': return openllm.AutoTFLLM
  elif implementation == 'flax': return openllm.AutoFlaxLLM
@@ -29,7 +37,10 @@ def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | o
  elif implementation == 'vllm': return openllm.AutoVLLM
  else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')")

-__all__ = ['generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects', 'dummy_vllm_objects']
+__all__ = [
+    'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects',
+    'dummy_vllm_objects'
+]

 def __dir__() -> t.Sequence[str]:
  return sorted(__all__)
--- a/openllm-python/tests/_strategies/_configuration.py
+++ b/openllm-python/tests/_strategies/_configuration.py
@@ -16,19 +16,32 @@ env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_na
 def model_settings(draw: st.DrawFn):
  '''Strategy for generating ModelSettings objects.'''
  kwargs: dict[str, t.Any] = {
-      'default_id': st.text(min_size=1),
-      'model_ids': st.lists(st.text(), min_size=1),
-      'architecture': st.text(min_size=1),
-      'url': st.text(),
-      'requires_gpu': st.booleans(),
-      'trust_remote_code': st.booleans(),
-      'requirements': st.none() | st.lists(st.text(), min_size=1),
-      'default_implementation': st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
-      'model_type': st.sampled_from(['causal_lm', 'seq2seq_lm']),
-      'runtime': st.sampled_from(['transformers', 'ggml']),
-      'name_type': st.sampled_from(['dasherize', 'lowercase']),
-      'timeout': st.integers(min_value=3600),
-      'workers_per_resource': st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
+      'default_id':
+          st.text(min_size=1),
+      'model_ids':
+          st.lists(st.text(), min_size=1),
+      'architecture':
+          st.text(min_size=1),
+      'url':
+          st.text(),
+      'requires_gpu':
+          st.booleans(),
+      'trust_remote_code':
+          st.booleans(),
+      'requirements':
+          st.none() | st.lists(st.text(), min_size=1),
+      'default_implementation':
+          st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])),
+      'model_type':
+          st.sampled_from(['causal_lm', 'seq2seq_lm']),
+      'runtime':
+          st.sampled_from(['transformers', 'ggml']),
+      'name_type':
+          st.sampled_from(['dasherize', 'lowercase']),
+      'timeout':
+          st.integers(min_value=3600),
+      'workers_per_resource':
+          st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
  }
  return draw(st.builds(ModelSettings, **kwargs))

--- a/openllm-python/tests/configuration_test.py
+++ b/openllm-python/tests/configuration_test.py
@@ -24,19 +24,29 @@ from ._strategies._configuration import make_llm_config
 from ._strategies._configuration import model_settings

 # XXX: @aarnphm fixes TypedDict behaviour in 3.11
-@pytest.mark.skipif(sys.version_info[:2] == (3, 11), reason='TypedDict in 3.11 behaves differently, so we need to fix this')
+@pytest.mark.skipif(sys.version_info[:2] == (3, 11),
+                    reason='TypedDict in 3.11 behaves differently, so we need to fix this')
 def test_missing_default():
  with pytest.raises(ValueError, match='Missing required fields *'):
    make_llm_config('MissingDefaultId', {'name_type': 'lowercase', 'requirements': ['bentoml']})
  with pytest.raises(ValueError, match='Missing required fields *'):
    make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']})
  with pytest.raises(ValueError, match='Missing required fields *'):
-    make_llm_config('MissingArchitecture', {'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing'], 'requirements': ['bentoml'],},)
+    make_llm_config(
+        'MissingArchitecture', {
+            'default_id': 'huggingface/t5-tiny-testing',
+            'model_ids': ['huggingface/t5-tiny-testing'],
+            'requirements': ['bentoml'],
+        },
+    )

 def test_forbidden_access():
  cl_ = make_llm_config(
      'ForbiddenAccess', {
-          'default_id': 'huggingface/t5-tiny-testing', 'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'], 'architecture': 'PreTrainedModel', 'requirements': ['bentoml'],
+          'default_id': 'huggingface/t5-tiny-testing',
+          'model_ids': ['huggingface/t5-tiny-testing', 'bentoml/t5-tiny-testing'],
+          'architecture': 'PreTrainedModel',
+          'requirements': ['bentoml'],
      },
  )

@@ -69,9 +79,16 @@ def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings):
  cl_ = make_llm_config('AttrsProtocolLLM', gen_settings)
  assert attr.has(cl_)

-@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),)
-def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float):
-  cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),)
+@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),
+       st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),
+      )
+def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int,
+                             input_temperature: float):
+  cl_ = make_llm_config('ComplexLLM',
+                        gen_settings,
+                        fields=(('field1', 'float', field1),),
+                        generation_fields=(('temperature', temperature),),
+                       )
  sent = cl_()
  assert sent.model_dump()['field1'] == field1
  assert sent.model_dump()['generation_config']['temperature'] == temperature
@@ -94,7 +111,10 @@ def patch_env(**attrs: t.Any):
    yield

 def test_struct_envvar():
-  with patch_env(**{field_env_key('env_llm', 'field1'): '4', field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',}):
+  with patch_env(**{
+      field_env_key('env_llm', 'field1'): '4',
+      field_env_key('env_llm', 'temperature', suffix='generation'): '0.2',
+  }):

    class EnvLLM(openllm.LLMConfig):
      __config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
@@ -112,6 +132,7 @@ def test_struct_envvar():
    assert overwrite_default['temperature'] == 0.2

 def test_struct_provided_fields():
+
  class EnvLLM(openllm.LLMConfig):
    __config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',}
    field1: int = 2
@@ -127,11 +148,13 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat
  with monkeypatch.context() as mk:
    mk.setenv(field_env_key('overwrite_with_env_available', 'field1'), str(4.0))
    mk.setenv(field_env_key('overwrite_with_env_available', 'temperature', suffix='generation'), str(0.2))
-    sent = make_llm_config(
-        'OverwriteWithEnvAvailable', {
-            'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel'
-        }, fields=(('field1', 'float', 3.0),),
-    ).model_construct_env(field1=20.0, temperature=0.4)
+    sent = make_llm_config('OverwriteWithEnvAvailable', {
+        'default_id': 'asdfasdf',
+        'model_ids': ['asdf', 'asdfasdfads'],
+        'architecture': 'PreTrainedModel'
+    },
+                           fields=(('field1', 'float', 3.0),),
+                          ).model_construct_env(field1=20.0, temperature=0.4)
    assert sent.generation_config.temperature == 0.4
    assert sent.field1 == 20.0

--- a/openllm-python/tests/conftest.py
+++ b/openllm-python/tests/conftest.py
@@ -10,23 +10,37 @@ import openllm
 if t.TYPE_CHECKING:
  from openllm_core._typing_compat import LiteralRuntime

-_FRAMEWORK_MAPPING = {'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B',}
-_PROMPT_MAPPING = {'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',}
+_FRAMEWORK_MAPPING = {
+    'flan_t5': 'google/flan-t5-small',
+    'opt': 'facebook/opt-125m',
+    'baichuan': 'baichuan-inc/Baichuan-7B',
+}
+_PROMPT_MAPPING = {
+    'qa':
+        'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',
+}

-def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
+def parametrise_local_llm(
+    model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]:
  if model not in _FRAMEWORK_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.")
  runtime_impl: tuple[LiteralRuntime, ...] = tuple()
  if model in openllm.MODEL_MAPPING_NAMES: runtime_impl += ('pt',)
  if model in openllm.MODEL_FLAX_MAPPING_NAMES: runtime_impl += ('flax',)
  if model in openllm.MODEL_TF_MAPPING_NAMES: runtime_impl += ('tf',)
  for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()):
-    llm = openllm.Runner(model, model_id=_FRAMEWORK_MAPPING[model], ensure_available=True, implementation=framework, init_local=True,)
+    llm = openllm.Runner(model,
+                         model_id=_FRAMEWORK_MAPPING[model],
+                         ensure_available=True,
+                         implementation=framework,
+                         init_local=True,
+                        )
    yield prompt, llm

 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
  if os.getenv('GITHUB_ACTIONS') is None:
    if 'prompt' in metafunc.fixturenames and 'llm' in metafunc.fixturenames:
-      metafunc.parametrize('prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])
+      metafunc.parametrize('prompt,llm',
+                           [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])])

 def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
  # If no tests are collected, pytest exists with code 5, which makes the CI fail.
--- a/openllm-python/tests/models/conftest.py
+++ b/openllm-python/tests/models/conftest.py
@@ -40,7 +40,13 @@ if t.TYPE_CHECKING:
  from openllm.client import BaseAsyncClient

 class ResponseComparator(JSONSnapshotExtension):
-  def serialize(self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None,) -> SerializedData:
+
+  def serialize(self,
+                data: SerializableData,
+                *,
+                exclude: PropertyFilter | None = None,
+                matcher: PropertyMatcher | None = None,
+               ) -> SerializedData:
    if openllm.utils.LazyType(ListAny).isinstance(data):
      data = [d.unmarshaled for d in data]
    else:
@@ -49,6 +55,7 @@ class ResponseComparator(JSONSnapshotExtension):
    return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode()

  def matches(self, *, serialized_data: SerializableData, snapshot_data: SerializableData) -> bool:
+
    def convert_data(data: SerializableData) -> openllm.GenerationOutput | t.Sequence[openllm.GenerationOutput]:
      try:
        data = orjson.loads(data)
@@ -73,9 +80,11 @@ class ResponseComparator(JSONSnapshotExtension):
      return s == t

    def eq_output(s: openllm.GenerationOutput, t: openllm.GenerationOutput) -> bool:
-      return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and eq_config(s.marshaled_config, t.marshaled_config))
+      return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and
+              eq_config(s.marshaled_config, t.marshaled_config))

-    return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])
+    return len(serialized_data) == len(snapshot_data) and all(
+        [eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)])

@pytest.fixture()
 def response_snapshot(snapshot: SnapshotAssertion):
@@ -124,8 +133,14 @@ class LocalHandle(_Handle):
    return self.process.poll() is None

 class HandleProtocol(t.Protocol):
+
  @contextlib.contextmanager
-  def __call__(*, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None,) -> t.Generator[_Handle, None, None]:
+  def __call__(*,
+               model: str,
+               model_id: str,
+               image_tag: str,
+               quantize: t.AnyStr | None = None,
+              ) -> t.Generator[_Handle, None, None]:
    ...

@attr.define(init=False)
@@ -133,7 +148,9 @@ class DockerHandle(_Handle):
  container_name: str
  docker_client: docker.DockerClient

-  def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int, deployment_mode: t.Literal['container', 'local'],):
+  def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int,
+               deployment_mode: t.Literal['container', 'local'],
+              ):
    self.__attrs_init__(port, deployment_mode, container_name, docker_client)

  def status(self) -> bool:
@@ -141,16 +158,29 @@ class DockerHandle(_Handle):
    return container.status in ['running', 'created']

@contextlib.contextmanager
-def _local_handle(
-    model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False,
-):
+def _local_handle(model: str,
+                  model_id: str,
+                  image_tag: str,
+                  deployment_mode: t.Literal['container', 'local'],
+                  quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+                  *,
+                  _serve_grpc: bool = False,
+                 ):
  with openllm.utils.reserve_free_port() as port:
    pass

  if not _serve_grpc:
-    proc = openllm.start(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True)
+    proc = openllm.start(model,
+                         model_id=model_id,
+                         quantize=quantize,
+                         additional_args=['--port', str(port)],
+                         __test__=True)
  else:
-    proc = openllm.start_grpc(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True)
+    proc = openllm.start_grpc(model,
+                              model_id=model_id,
+                              quantize=quantize,
+                              additional_args=['--port', str(port)],
+                              __test__=True)

  yield LocalHandle(proc, port, deployment_mode)
  proc.terminate()
@@ -164,9 +194,14 @@ def _local_handle(
    proc.stderr.close()

@contextlib.contextmanager
-def _container_handle(
-    model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False,
-):
+def _container_handle(model: str,
+                      model_id: str,
+                      image_tag: str,
+                      deployment_mode: t.Literal['container', 'local'],
+                      quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
+                      *,
+                      _serve_grpc: bool = False,
+                     ):
  envvar = openllm.utils.EnvVarMixin(model)

  with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port:
@@ -191,11 +226,18 @@ def _container_handle(
  gpus = openllm.utils.device_count() or -1
  devs = [docker.types.DeviceRequest(count=gpus, capabilities=[['gpu']])] if gpus > 0 else None

-  container = client.containers.run(
-      image_tag, command=args, name=container_name, environment=env, auto_remove=False, detach=True, device_requests=devs, ports={
-          '3000/tcp': port, '3001/tcp': prom_port
-      },
-  )
+  container = client.containers.run(image_tag,
+                                    command=args,
+                                    name=container_name,
+                                    environment=env,
+                                    auto_remove=False,
+                                    detach=True,
+                                    device_requests=devs,
+                                    ports={
+                                        '3000/tcp': port,
+                                        '3001/tcp': prom_port
+                                    },
+                                   )

  yield DockerHandle(client, container.name, port, deployment_mode)

--- a/openllm-python/tests/models/flan_t5_test.py
+++ b/openllm-python/tests/models/flan_t5_test.py
@@ -16,8 +16,11 @@ model = 'flan_t5'
 model_id = 'google/flan-t5-small'

@pytest.fixture(scope='module')
-def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
-  with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
+def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'],
+                   clean_context: contextlib.ExitStack,
+                  ):
+  with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode,
+                               clean_context=clean_context) as image_tag:
    with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
      yield handle

--- a/openllm-python/tests/models/opt_test.py
+++ b/openllm-python/tests/models/opt_test.py
@@ -16,8 +16,11 @@ model = 'opt'
 model_id = 'facebook/opt-125m'

@pytest.fixture(scope='module')
-def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,):
-  with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag:
+def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'],
+                    clean_context: contextlib.ExitStack,
+                   ):
+  with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode,
+                               clean_context=clean_context) as image_tag:
    with handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
      yield handle

--- a/openllm-python/tests/package_test.py
+++ b/openllm-python/tests/package_test.py
@@ -15,7 +15,9 @@ if t.TYPE_CHECKING:
 HF_INTERNAL_T5_TESTING = 'hf-internal-testing/tiny-random-t5'

 actions_xfail = functools.partial(
-    pytest.mark.xfail, condition=os.getenv('GITHUB_ACTIONS') is not None, reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.',
+    pytest.mark.xfail,
+    condition=os.getenv('GITHUB_ACTIONS') is not None,
+    reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.',
 )

@actions_xfail
@@ -46,7 +48,9 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory):
@pytest.fixture()
 def dockerfile_template(tmp_path_factory: pytest.TempPathFactory):
  file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template'
-  file.write_text("{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}")
+  file.write_text(
+      "{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}"
+  )
  return file

@pytest.mark.usefixtures('dockerfile_template')
--- a/openllm-python/tests/strategies_test.py
+++ b/openllm-python/tests/strategies_test.py
@@ -71,9 +71,11 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch):
    mcls.setenv('CUDA_VISIBLE_DEVICES', '')
    assert len(NvidiaGpuResource.from_system()) >= 0  # TODO: real from_system tests

-    assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],).match('Input list should be all string type.')
+    assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],
+                        ).match('Input list should be all string type.')
    assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match('Input list should be all string type.')
-    assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')
+    assert pytest.raises(ValueError, NvidiaGpuResource.validate,
+                         ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID')

 def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
  with monkeypatch.context() as mcls: