Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-01-10 08:28:53 -05:00)
fix(base-image): update base image to include cuda for now (#720)
* fix(base-image): update base image to include cuda for now
* fix: build core and client on release images
* chore: cleanup style changes

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -1,4 +1,4 @@
"""OpenLLM.
'''OpenLLM.
===========

An open platform for operating large language models in production.
@@ -8,13 +8,11 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
* Option to bring your own fine-tuned LLMs
* Online Serving with HTTP, gRPC, SSE or custom API
* Native integration with BentoML, LangChain, OpenAI compatible endpoints, LlamaIndex for custom LLM apps
"""
'''

# fmt: off
# update-config-stubs.py: import stubs start
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING,CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,AutoConfig as AutoConfig,BaichuanConfig as BaichuanConfig,ChatGLMConfig as ChatGLMConfig,DollyV2Config as DollyV2Config,FalconConfig as FalconConfig,FlanT5Config as FlanT5Config,GPTNeoXConfig as GPTNeoXConfig,LlamaConfig as LlamaConfig,MistralConfig as MistralConfig,MPTConfig as MPTConfig,OPTConfig as OPTConfig,PhiConfig as PhiConfig,StableLMConfig as StableLMConfig,StarCoderConfig as StarCoderConfig,YiConfig as YiConfig
from openlm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
# update-config-stubs.py: import stubs stop
# fmt: on

from openllm_cli._sdk import (
build as build,

@@ -1,2 +1,4 @@
# fmt: off
if __name__ == '__main__':from openllm_cli.entrypoint import cli;cli()
if __name__ == '__main__':
from openllm_cli.entrypoint import cli

cli()

@@ -59,7 +59,7 @@ def Runner(
"'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation."
)
model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id']))
_RUNNER_MSG = f"""\
_RUNNER_MSG = f'''\
Using 'openllm.Runner' is now deprecated. Make sure to switch to the following syntax:

```python
@@ -71,7 +71,7 @@ def Runner(
async def chat(input: str) -> str:
async for it in llm.generate_iterator(input): print(it)
```
"""
'''
warnings.warn(_RUNNER_MSG, DeprecationWarning, stacklevel=2)
attrs.update(
{

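For context, the migration target that the truncated `_RUNNER_MSG` above points to looks roughly like the sketch below; the model id, the prompt, and the exact `openllm.LLM` constructor arguments are illustrative assumptions, and only `llm.generate_iterator` is taken from the message itself:

```python
import asyncio

import openllm

llm = openllm.LLM('facebook/opt-1.3b')  # illustrative model id


async def chat(prompt: str) -> None:
  # mirrors the snippet embedded in _RUNNER_MSG: stream chunks as they are generated
  async for it in llm.generate_iterator(prompt):
    print(it)


asyncio.run(chat('What is the meaning of life?'))
```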
@@ -33,7 +33,7 @@ def is_sentence_complete(output):


def is_partial_stop(output, stop_str):
"""Check whether the output contains a partial stop str."""
'''Check whether the output contains a partial stop str.'''
for i in range(min(len(output), len(stop_str))):
if stop_str.startswith(output[-i:]):
return True

@@ -1,8 +1,6 @@
from __future__ import annotations
import functools, logging, os, warnings
import typing as t
import attr, inflection, orjson
import bentoml, openllm
import functools, logging, os, warnings, typing as t
import attr, inflection, orjson, bentoml, openllm
from openllm_core._schemas import GenerationOutput
from openllm_core._typing_compat import (
AdapterMap,
@@ -20,9 +18,9 @@ from openllm_core._typing_compat import (
from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import (
DEBUG,
ENV_VARS_TRUE_VALUES,
ReprMixin,
apply,
check_bool_env,
codegen,
first_not_none,
flatten_attrs,
@@ -142,31 +140,33 @@ class LLM(t.Generic[M, T], ReprMixin):
|
||||
|
||||
# NOTE: If you are here to see how generate_iterator and generate works, see above.
|
||||
# The below are mainly for internal implementation that you don't have to worry about.
|
||||
# fmt: off
|
||||
|
||||
_model_id:str
|
||||
_revision:t.Optional[str]
|
||||
_quantization_config:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]]
|
||||
_model_id: str
|
||||
_revision: t.Optional[str]
|
||||
_quantization_config: t.Optional[
|
||||
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
|
||||
]
|
||||
_quantise: t.Optional[LiteralQuantise]
|
||||
_model_decls:TupleAny
|
||||
__model_attrs:DictStrAny
|
||||
__tokenizer_attrs:DictStrAny
|
||||
_tag:bentoml.Tag
|
||||
_adapter_map:t.Optional[AdapterMap]
|
||||
_serialisation:LiteralSerialisation
|
||||
_local:bool
|
||||
_max_model_len:t.Optional[int]
|
||||
_model_decls: TupleAny
|
||||
__model_attrs: DictStrAny
|
||||
__tokenizer_attrs: DictStrAny
|
||||
_tag: bentoml.Tag
|
||||
_adapter_map: t.Optional[AdapterMap]
|
||||
_serialisation: LiteralSerialisation
|
||||
_local: bool
|
||||
_max_model_len: t.Optional[int]
|
||||
|
||||
__llm_dtype__: t.Union[LiteralDtype,t.Literal['auto', 'half', 'float']]='auto'
|
||||
__llm_torch_dtype__:'torch.dtype'=None
|
||||
__llm_config__:t.Optional[LLMConfig]=None
|
||||
__llm_backend__:LiteralBackend=None
|
||||
__llm_quantization_config__:t.Optional[t.Union[transformers.BitsAndBytesConfig,transformers.GPTQConfig,transformers.AwqConfig]]=None
|
||||
__llm_runner__:t.Optional[Runner[M, T]]=None
|
||||
__llm_model__:t.Optional[M]=None
|
||||
__llm_tokenizer__:t.Optional[T]=None
|
||||
__llm_adapter_map__:t.Optional[ResolvedAdapterMap]=None
|
||||
__llm_trust_remote_code__:bool=False
|
||||
__llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto'
|
||||
__llm_torch_dtype__: 'torch.dtype' = None
|
||||
__llm_config__: t.Optional[LLMConfig] = None
|
||||
__llm_backend__: LiteralBackend = None
|
||||
__llm_quantization_config__: t.Optional[
|
||||
t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]
|
||||
] = None
|
||||
__llm_runner__: t.Optional[Runner[M, T]] = None
|
||||
__llm_model__: t.Optional[M] = None
|
||||
__llm_tokenizer__: t.Optional[T] = None
|
||||
__llm_adapter_map__: t.Optional[ResolvedAdapterMap] = None
|
||||
__llm_trust_remote_code__: bool = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -188,26 +188,34 @@ class LLM(t.Generic[M, T], ReprMixin):
|
||||
_eager=True,
|
||||
**attrs,
|
||||
):
|
||||
torch_dtype=attrs.pop('torch_dtype',None) # backward compatible
|
||||
if torch_dtype is not None:warnings.warns('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',DeprecationWarning,stacklevel=3);dtype=torch_dtype
|
||||
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
|
||||
if torch_dtype is not None:
|
||||
warnings.warns(
|
||||
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
dtype = torch_dtype
|
||||
_local = False
|
||||
if validate_is_path(model_id):model_id,_local=resolve_filepath(model_id),True
|
||||
backend=first_not_none(getenv('backend',default=backend),default=self._cascade_backend())
|
||||
dtype=first_not_none(getenv('dtype',default=dtype,var=['TORCH_DTYPE']),default='auto')
|
||||
quantize=first_not_none(getenv('quantize',default=quantize,var=['QUANITSE']),default=None)
|
||||
attrs.update({'low_cpu_mem_usage':low_cpu_mem_usage})
|
||||
if validate_is_path(model_id):
|
||||
model_id, _local = resolve_filepath(model_id), True
|
||||
backend = first_not_none(getenv('backend', default=backend), default=self._cascade_backend())
|
||||
dtype = first_not_none(getenv('dtype', default=dtype, var=['TORCH_DTYPE']), default='auto')
|
||||
quantize = first_not_none(getenv('quantize', default=quantize, var=['QUANITSE']), default=None)
|
||||
attrs.update({'low_cpu_mem_usage': low_cpu_mem_usage})
|
||||
# parsing tokenizer and model kwargs, as the hierarchy is param pass > default
|
||||
model_attrs,tokenizer_attrs=flatten_attrs(**attrs)
|
||||
model_attrs, tokenizer_attrs = flatten_attrs(**attrs)
|
||||
if model_tag is None:
|
||||
model_tag,model_version=self._make_tag_components(model_id,model_version,backend=backend)
|
||||
if model_version:model_tag=f'{model_tag}:{model_version}'
|
||||
model_tag, model_version = self._make_tag_components(model_id, model_version, backend=backend)
|
||||
if model_version:
|
||||
model_tag = f'{model_tag}:{model_version}'
|
||||
|
||||
self.__attrs_init__(
|
||||
model_id=model_id,
|
||||
revision=model_version,
|
||||
tag=bentoml.Tag.from_taglike(model_tag),
|
||||
quantization_config=quantization_config,
|
||||
quantise=getattr(self._Quantise,backend)(self,quantize),
|
||||
quantise=getattr(self._Quantise, backend)(self, quantize),
|
||||
model_decls=args,
|
||||
adapter_map=convert_peft_config_type(adapter_map) if adapter_map is not None else None,
|
||||
serialisation=serialisation,
|
||||
@@ -220,143 +228,248 @@ class LLM(t.Generic[M, T], ReprMixin):
|
||||
llm_config__=llm_config,
|
||||
llm_trust_remote_code__=trust_remote_code,
|
||||
)
|
||||
|
||||
if _eager:
|
||||
try:
|
||||
model=bentoml.models.get(self.tag)
|
||||
model = bentoml.models.get(self.tag)
|
||||
except bentoml.exceptions.NotFound:
|
||||
model=openllm.serialisation.import_model(self,trust_remote_code=self.trust_remote_code)
|
||||
model = openllm.serialisation.import_model(self, trust_remote_code=self.trust_remote_code)
|
||||
# resolve the tag
|
||||
self._tag=model.tag
|
||||
if not _eager and embedded:raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
|
||||
if embedded:logger.warning('Models will be loaded into memory. NOT RECOMMENDED in production and SHOULD ONLY used for development.');self.runner.init_local(quiet=True)
|
||||
self._tag = model.tag
|
||||
if not _eager and embedded:
|
||||
raise RuntimeError("Embedded mode is not supported when '_eager' is False.")
|
||||
if embedded:
|
||||
logger.warning(
|
||||
'NOT RECOMMENDED in production and SHOULD ONLY used for development (Loading into current memory).'
|
||||
)
|
||||
self.runner.init_local(quiet=True)
|
||||
|
||||
class _Quantise:
|
||||
@staticmethod
|
||||
def pt(llm:LLM,quantise=None):return quantise
|
||||
@staticmethod
|
||||
def vllm(llm:LLM,quantise=None):return quantise
|
||||
@staticmethod
|
||||
def ctranslate(llm:LLM,quantise=None):
|
||||
if quantise in {'int4','awq','gptq','squeezellm'}:raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
|
||||
if quantise=='int8':quantise='int8_float16' if llm._has_gpus else 'int8_float32'
|
||||
def pt(llm: LLM, quantise=None):
|
||||
return quantise
|
||||
@apply(lambda val:tuple(str.lower(i) if i else i for i in val))
|
||||
def _make_tag_components(self,model_id:str,model_version:str|None,backend:str)->tuple[str,str|None]:
|
||||
model_id,*maybe_revision=model_id.rsplit(':')
|
||||
if len(maybe_revision)>0:
|
||||
if model_version is not None:logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.",maybe_revision[0],model_version)
|
||||
|
||||
@staticmethod
|
||||
def vllm(llm: LLM, quantise=None):
|
||||
return quantise
|
||||
|
||||
@staticmethod
|
||||
def ctranslate(llm: LLM, quantise=None):
|
||||
if quantise in {'int4', 'awq', 'gptq', 'squeezellm'}:
|
||||
raise ValueError(f"Quantisation '{quantise}' is not supported for backend 'ctranslate'")
|
||||
if quantise == 'int8':
|
||||
quantise = 'int8_float16' if llm._has_gpus else 'int8_float32'
|
||||
return quantise
|
||||
|
||||
@apply(lambda val: tuple(str.lower(i) if i else i for i in val))
|
||||
def _make_tag_components(self, model_id: str, model_version: str | None, backend: str) -> tuple[str, str | None]:
|
||||
model_id, *maybe_revision = model_id.rsplit(':')
|
||||
if len(maybe_revision) > 0:
|
||||
if model_version is not None:
|
||||
logger.warning(
|
||||
"revision is specified (%s). 'model_version=%s' will be ignored.", maybe_revision[0], model_version
|
||||
)
|
||||
model_version = maybe_revision[0]
|
||||
if validate_is_path(model_id):model_id,model_version=resolve_filepath(model_id),first_not_none(model_version,default=generate_hash_from_file(model_id))
|
||||
return f'{backend}-{normalise_model_name(model_id)}',model_version
|
||||
if validate_is_path(model_id):
|
||||
model_id, model_version = (
|
||||
resolve_filepath(model_id),
|
||||
first_not_none(model_version, default=generate_hash_from_file(model_id)),
|
||||
)
|
||||
return f'{backend}-{normalise_model_name(model_id)}', model_version
|
||||
|
||||
@functools.cached_property
|
||||
def _has_gpus(self):
|
||||
try:
|
||||
from cuda import cuda
|
||||
err,*_=cuda.cuInit(0)
|
||||
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to initialise CUDA runtime binding.')
|
||||
err,num_gpus=cuda.cuDeviceGetCount()
|
||||
if err!=cuda.CUresult.CUDA_SUCCESS:raise RuntimeError('Failed to get CUDA device count.')
|
||||
|
||||
err, *_ = cuda.cuInit(0)
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise RuntimeError('Failed to initialise CUDA runtime binding.')
|
||||
err, num_gpus = cuda.cuDeviceGetCount()
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise RuntimeError('Failed to get CUDA device count.')
|
||||
return True
|
||||
except (ImportError, RuntimeError):return False
|
||||
except (ImportError, RuntimeError):
|
||||
return False
|
||||
|
||||
@property
|
||||
def _torch_dtype(self):
|
||||
import torch, transformers
|
||||
_map=_torch_dtype_mapping()
|
||||
if not isinstance(self.__llm_torch_dtype__,torch.dtype):
|
||||
try:hf_config=transformers.AutoConfig.from_pretrained(self.bentomodel.path,trust_remote_code=self.trust_remote_code)
|
||||
except OpenLLMException:hf_config=transformers.AutoConfig.from_pretrained(self.model_id,trust_remote_code=self.trust_remote_code)
|
||||
config_dtype=getattr(hf_config,'torch_dtype',None)
|
||||
if config_dtype is None:config_dtype=torch.float32
|
||||
if self.__llm_dtype__=='auto':
|
||||
if config_dtype==torch.float32:torch_dtype=torch.float16
|
||||
else:torch_dtype=config_dtype
|
||||
|
||||
_map = _torch_dtype_mapping()
|
||||
if not isinstance(self.__llm_torch_dtype__, torch.dtype):
|
||||
try:
|
||||
hf_config = transformers.AutoConfig.from_pretrained(
|
||||
self.bentomodel.path, trust_remote_code=self.trust_remote_code
|
||||
)
|
||||
except OpenLLMException:
|
||||
hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code)
|
||||
config_dtype = getattr(hf_config, 'torch_dtype', None)
|
||||
if config_dtype is None:
|
||||
config_dtype = torch.float32
|
||||
if self.__llm_dtype__ == 'auto':
|
||||
if config_dtype == torch.float32:
|
||||
torch_dtype = torch.float16
|
||||
else:
|
||||
torch_dtype = config_dtype
|
||||
else:
|
||||
if self.__llm_dtype__ not in _map:raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
|
||||
torch_dtype=_map[self.__llm_dtype__]
|
||||
self.__llm_torch_dtype__=torch_dtype
|
||||
if self.__llm_dtype__ not in _map:
|
||||
raise ValueError(f"Unknown dtype '{self.__llm_dtype__}'")
|
||||
torch_dtype = _map[self.__llm_dtype__]
|
||||
self.__llm_torch_dtype__ = torch_dtype
|
||||
return self.__llm_torch_dtype__
|
||||
|
||||
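As a quick sanity check on the dtype resolution shown above (a hedged walk-through, not part of the diff):

```python
# With __llm_dtype__ == 'auto':
#   - the HF config reports torch.float32 (or no torch_dtype at all) -> fall back to torch.float16
#   - the HF config reports e.g. torch.bfloat16                      -> keep torch.bfloat16
# With an explicit __llm_dtype__, e.g. 'half':
#   - _torch_dtype_mapping()['half'] == torch.float16; unknown keys raise ValueError
```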
@property
|
||||
def _model_attrs(self):return {**self.import_kwargs[0],**self.__model_attrs}
|
||||
def _model_attrs(self):
|
||||
return {**self.import_kwargs[0], **self.__model_attrs}
|
||||
|
||||
@_model_attrs.setter
|
||||
def _model_attrs(self, value):self.__model_attrs = value
|
||||
def _model_attrs(self, value):
|
||||
self.__model_attrs = value
|
||||
|
||||
@property
|
||||
def _tokenizer_attrs(self):return {**self.import_kwargs[1],**self.__tokenizer_attrs}
|
||||
def _cascade_backend(self)->LiteralBackend:
|
||||
def _tokenizer_attrs(self):
|
||||
return {**self.import_kwargs[1], **self.__tokenizer_attrs}
|
||||
|
||||
def _cascade_backend(self) -> LiteralBackend:
|
||||
if self._has_gpus:
|
||||
if is_vllm_available():return 'vllm'
|
||||
elif is_ctranslate_available():return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
|
||||
elif is_ctranslate_available():return 'ctranslate'
|
||||
else:return 'pt'
|
||||
def __setattr__(self,attr,value):
|
||||
if attr in {'model', 'tokenizer', 'runner', 'import_kwargs'}:raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
|
||||
if is_vllm_available():
|
||||
return 'vllm'
|
||||
elif is_ctranslate_available():
|
||||
return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
|
||||
elif is_ctranslate_available():
|
||||
return 'ctranslate'
|
||||
else:
|
||||
return 'pt'
|
||||
|
||||
def __setattr__(self, attr, value):
|
||||
if attr in {'model', 'tokenizer', 'runner', 'import_kwargs'}:
|
||||
raise ForbiddenAttributeError(f'{attr} should not be set during runtime.')
|
||||
super().__setattr__(attr, value)
|
||||
def __del__(self):del self.__llm_model__,self.__llm_tokenizer__,self.__llm_adapter_map__
|
||||
|
||||
def __del__(self):
|
||||
try:
|
||||
del self.__llm_model__, self.__llm_tokenizer__, self.__llm_adapter_map__
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
@property
|
||||
def __repr_keys__(self):return {'model_id','revision','backend','type'}
|
||||
def __repr_keys__(self):
|
||||
return {'model_id', 'revision', 'backend', 'type'}
|
||||
|
||||
def __repr_args__(self):
|
||||
yield 'model_id',self._model_id if not self._local else self.tag.name
|
||||
yield 'revision',self._revision if self._revision else self.tag.version
|
||||
yield 'backend',self.__llm_backend__
|
||||
yield 'type',self.llm_type
|
||||
yield 'model_id', self._model_id if not self._local else self.tag.name
|
||||
yield 'revision', self._revision if self._revision else self.tag.version
|
||||
yield 'backend', self.__llm_backend__
|
||||
yield 'type', self.llm_type
|
||||
|
||||
@property
|
||||
def import_kwargs(self):return {'device_map':'auto' if self._has_gpus else None,'torch_dtype':self._torch_dtype},{'padding_side':'left','truncation_side':'left'}
|
||||
def import_kwargs(self):
|
||||
return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {
|
||||
'padding_side': 'left',
|
||||
'truncation_side': 'left',
|
||||
}
|
||||
|
||||
@property
|
||||
def trust_remote_code(self):
|
||||
env=os.getenv('TRUST_REMOTE_CODE')
|
||||
if env is not None:return str(env).upper() in ENV_VARS_TRUE_VALUES
|
||||
env = os.getenv('TRUST_REMOTE_CODE')
|
||||
if env is not None:
|
||||
check_bool_env('TRUST_REMOTE_CODE', env)
|
||||
return self.__llm_trust_remote_code__
|
||||
|
||||
@property
|
||||
def model_id(self):return self._model_id
|
||||
def model_id(self):
|
||||
return self._model_id
|
||||
|
||||
@property
|
||||
def revision(self):return self._revision
|
||||
def revision(self):
|
||||
return self._revision
|
||||
|
||||
@property
|
||||
def tag(self):return self._tag
|
||||
def tag(self):
|
||||
return self._tag
|
||||
|
||||
@property
|
||||
def bentomodel(self):return openllm.serialisation.get(self)
|
||||
def bentomodel(self):
|
||||
return openllm.serialisation.get(self)
|
||||
|
||||
@property
|
||||
def quantization_config(self):
|
||||
if self.__llm_quantization_config__ is None:
|
||||
from ._quantisation import infer_quantisation_config
|
||||
if self._quantization_config is not None:self.__llm_quantization_config__ = self._quantization_config
|
||||
elif self._quantise is not None:self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self,self._quantise,**self._model_attrs)
|
||||
else:raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
|
||||
|
||||
if self._quantization_config is not None:
|
||||
self.__llm_quantization_config__ = self._quantization_config
|
||||
elif self._quantise is not None:
|
||||
self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(
|
||||
self, self._quantise, **self._model_attrs
|
||||
)
|
||||
else:
|
||||
raise ValueError("Either 'quantization_config' or 'quantise' must be specified.")
|
||||
return self.__llm_quantization_config__
|
||||
|
||||
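A hedged usage sketch of how this property is reached; the constructor arguments are illustrative and may not match the exact signature:

```python
llm = openllm.LLM('facebook/opt-1.3b', quantize='int8')  # hypothetical construction
# No explicit quantization_config was passed, so the property derives one
# from self._quantise via infer_quantisation_config and caches it.
config = llm.quantization_config
```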
@property
|
||||
def has_adapters(self):return self._adapter_map is not None
|
||||
def has_adapters(self):
|
||||
return self._adapter_map is not None
|
||||
|
||||
@property
|
||||
def local(self):return self._local
|
||||
def local(self):
|
||||
return self._local
|
||||
|
||||
@property
|
||||
def quantise(self):return self._quantise
|
||||
def quantise(self):
|
||||
return self._quantise
|
||||
|
||||
@property
|
||||
def llm_type(self):return normalise_model_name(self._model_id)
|
||||
def llm_type(self):
|
||||
return normalise_model_name(self._model_id)
|
||||
|
||||
@property
|
||||
def llm_parameters(self):return (self._model_decls,self._model_attrs),self._tokenizer_attrs
|
||||
def llm_parameters(self):
|
||||
return (self._model_decls, self._model_attrs), self._tokenizer_attrs
|
||||
|
||||
@property
|
||||
def identifying_params(self):return {'configuration':self.config.model_dump_json().decode(),'model_ids':orjson.dumps(self.config['model_ids']).decode(),'model_id':self.model_id}
|
||||
def identifying_params(self):
|
||||
return {
|
||||
'configuration': self.config.model_dump_json().decode(),
|
||||
'model_ids': orjson.dumps(self.config['model_ids']).decode(),
|
||||
'model_id': self.model_id,
|
||||
}
|
||||
|
||||
@property
|
||||
def tokenizer(self):
|
||||
if self.__llm_tokenizer__ is None:self.__llm_tokenizer__=openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
|
||||
if self.__llm_tokenizer__ is None:
|
||||
self.__llm_tokenizer__ = openllm.serialisation.load_tokenizer(self, **self.llm_parameters[-1])
|
||||
return self.__llm_tokenizer__
|
||||
|
||||
@property
|
||||
def runner(self):
|
||||
from ._runners import runner
|
||||
if self.__llm_runner__ is None:self.__llm_runner__=runner(self)
|
||||
|
||||
if self.__llm_runner__ is None:
|
||||
self.__llm_runner__ = runner(self)
|
||||
return self.__llm_runner__
|
||||
def prepare(self,adapter_type='lora',use_gradient_checking=True,**attrs):
|
||||
if self.__llm_backend__!='pt':raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
|
||||
|
||||
def prepare(self, adapter_type='lora', use_gradient_checking=True, **attrs):
|
||||
if self.__llm_backend__ != 'pt':
|
||||
raise RuntimeError('Fine tuning is only supported for PyTorch backend.')
|
||||
from peft.mapping import get_peft_model
|
||||
from peft.utils.other import prepare_model_for_kbit_training
|
||||
model=get_peft_model(
|
||||
prepare_model_for_kbit_training(self.model,use_gradient_checkpointing=use_gradient_checking),
|
||||
|
||||
model = get_peft_model(
|
||||
prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checking),
|
||||
self.config['fine_tune_strategies']
|
||||
.get(adapter_type,self.config.make_fine_tune_config(adapter_type))
|
||||
.get(adapter_type, self.config.make_fine_tune_config(adapter_type))
|
||||
.train()
|
||||
.with_config(**attrs)
|
||||
.build(),
|
||||
)
|
||||
if DEBUG:model.print_trainable_parameters()
|
||||
return model,self.tokenizer
|
||||
def prepare_for_training(self,*args,**attrs):logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Please use `prepare` instead.');return self.prepare(*args,**attrs)
|
||||
if DEBUG:
|
||||
model.print_trainable_parameters()
|
||||
return model, self.tokenizer
|
||||
|
||||
def prepare_for_training(self, *args, **attrs):
|
||||
logger.warning('`prepare_for_training` is deprecated and will be removed in the future. Use `prepare` instead.')
|
||||
return self.prepare(*args, **attrs)
|
||||
|
||||
@property
|
||||
def adapter_map(self):
|
||||
@@ -431,33 +544,49 @@ class LLM(t.Generic[M, T], ReprMixin):
|
||||
return self.__llm_config__
|
||||
|
||||
|
||||
# fmt: off
|
||||
@functools.lru_cache(maxsize=1)
|
||||
def _torch_dtype_mapping()->dict[str,torch.dtype]:
|
||||
import torch; return {
|
||||
def _torch_dtype_mapping() -> dict[str, torch.dtype]:
|
||||
import torch
|
||||
|
||||
return {
|
||||
'half': torch.float16,
|
||||
'float': torch.float32,
|
||||
'float16': torch.float16,
|
||||
'float': torch.float32,
|
||||
'float32': torch.float32,
|
||||
'bfloat16': torch.bfloat16,
|
||||
}
|
||||
def normalise_model_name(name:str)->str:return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/','--'))
|
||||
def convert_peft_config_type(adapter_map:dict[str, str])->AdapterMap:
|
||||
if not is_peft_available():raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'")
|
||||
|
||||
|
||||
def normalise_model_name(name: str) -> str:
|
||||
return (
|
||||
os.path.basename(resolve_filepath(name))
|
||||
if validate_is_path(name)
|
||||
else inflection.dasherize(name.replace('/', '--'))
|
||||
)
|
||||
|
||||
|
||||
def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
|
||||
if not is_peft_available():
|
||||
raise RuntimeError(
|
||||
"LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'"
|
||||
)
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
resolved:AdapterMap={}
|
||||
resolved: AdapterMap = {}
|
||||
for path_or_adapter_id, name in adapter_map.items():
|
||||
if name is None:raise ValueError('Adapter name must be specified.')
|
||||
if name is None:
|
||||
raise ValueError('Adapter name must be specified.')
|
||||
if os.path.isfile(os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)):
|
||||
config_file=os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
|
||||
config_file = os.path.join(path_or_adapter_id, PEFT_CONFIG_NAME)
|
||||
else:
|
||||
try:
|
||||
config_file=hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
|
||||
config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
|
||||
except Exception as err:
|
||||
raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
|
||||
with open(config_file, 'r') as file:resolved_config=orjson.loads(file.read())
|
||||
_peft_type=resolved_config['peft_type'].lower()
|
||||
if _peft_type not in resolved:resolved[_peft_type]=()
|
||||
resolved[_peft_type]+=(_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
|
||||
with open(config_file, 'r') as file:
|
||||
resolved_config = orjson.loads(file.read())
|
||||
_peft_type = resolved_config['peft_type'].lower()
|
||||
if _peft_type not in resolved:
|
||||
resolved[_peft_type] = ()
|
||||
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
|
||||
return resolved
|
||||
|
||||
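A hedged sketch of the mapping shape `convert_peft_config_type` consumes; the adapter ids and names below are made up, and `PEFT_CONFIG_NAME` is assumed to be the usual `adapter_config.json`:

```python
adapter_map = {
  '/path/to/local/lora-adapter': 'local-lora',  # directory containing the PEFT config on disk
  'someuser/opt-6.7b-lora': 'hub-lora',         # otherwise resolved via hf_hub_download
}
resolved = convert_peft_config_type(adapter_map)
# entries are grouped by their lowercased 'peft_type', e.g. {'lora': (_AdapterTuple(...), ...)}
```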
@@ -1,2 +1,9 @@
# fmt: off
import os,orjson,openllm_core.utils as coreutils;model_id,model_tag,adapter_map,serialization,trust_remote_code=os.environ['OPENLLM_MODEL_ID'],None,orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP',orjson.dumps(None))),os.getenv('OPENLLM_SERIALIZATION',default='safetensors'),coreutils.check_bool_env('TRUST_REMOTE_CODE',False)
import os, orjson, openllm_core.utils as coreutils

model_id, model_tag, adapter_map, serialization, trust_remote_code = (
os.environ['OPENLLM_MODEL_ID'],
None,
orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))),
os.getenv('OPENLLM_SERIALIZATION', default='safetensors'),
coreutils.check_bool_env('TRUST_REMOTE_CODE', False),
)

@@ -215,18 +215,18 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
|
||||
NvidiaGpuResource = _make_resource_class(
|
||||
'NvidiaGpuResource',
|
||||
'nvidia.com/gpu',
|
||||
"""NVIDIA GPU resource.
|
||||
'''NVIDIA GPU resource.
|
||||
|
||||
This is a modified version of internal's BentoML's NvidiaGpuResource
|
||||
where it respects and parse CUDA_VISIBLE_DEVICES correctly.""",
|
||||
where it respects and parse CUDA_VISIBLE_DEVICES correctly.''',
|
||||
)
|
||||
AmdGpuResource = _make_resource_class(
|
||||
'AmdGpuResource',
|
||||
'amd.com/gpu',
|
||||
"""AMD GPU resource.
|
||||
'''AMD GPU resource.
|
||||
|
||||
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
|
||||
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""",
|
||||
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.''',
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -19,10 +19,10 @@ class CascadingResourceStrategy:
|
||||
resource_request: Optional[Dict[str, Any]],
|
||||
workers_per_resource: float,
|
||||
) -> int:
|
||||
"""Return the number of workers to be used for the given runnable class.
|
||||
'''Return the number of workers to be used for the given runnable class.
|
||||
|
||||
Note that for all available GPU, the number of workers will always be 1.
|
||||
"""
|
||||
'''
|
||||
@classmethod
|
||||
def get_worker_env(
|
||||
cls,
|
||||
@@ -31,16 +31,16 @@ class CascadingResourceStrategy:
|
||||
workers_per_resource: Union[int, float],
|
||||
worker_index: int,
|
||||
) -> Dict[str, Any]:
|
||||
"""Get worker env for this given worker_index.
|
||||
'''Get worker env for this given worker_index.
|
||||
|
||||
Args:
|
||||
runnable_class: The runnable class to be run.
|
||||
resource_request: The resource request of the runnable.
|
||||
workers_per_resource: # of workers per resource.
|
||||
worker_index: The index of the worker, start from 0.
|
||||
"""
|
||||
'''
|
||||
@staticmethod
|
||||
def transpile_workers_to_cuda_envvar(
|
||||
workers_per_resource: Union[float, int], gpus: List[str], worker_index: int
|
||||
) -> str:
|
||||
"""Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string."""
|
||||
'''Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.'''
|
||||
|
||||
@@ -22,7 +22,7 @@ OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'


def build_editable(path, package='openllm'):
"""Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set."""
'''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.'''
if not check_bool_env(OPENLLM_DEV_BUILD, default=False):
return None
# We need to build the package in editable mode, so that we can import it

@@ -1,12 +1,10 @@
# syntax=docker/dockerfile-upstream:master
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM python:3.9-slim-bullseye as base-container
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base-container

# Automatically set by buildx
ARG TARGETPLATFORM

ENV PATH /opt/conda/bin:$PATH

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
@@ -15,23 +13,32 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
ccache \
curl \
libssl-dev ca-certificates make \
git && \
git python3-pip && \
rm -rf /var/lib/apt/lists/*

RUN mkdir -p /openllm-python
RUN mkdir -p /openllm-core
RUN mkdir -p /openllm-client

# Install required dependencies
COPY openllm-python/src src
COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml ./
COPY openllm-python/src /openllm-python/src
COPY hatch.toml README.md CHANGELOG.md openllm-python/pyproject.toml /openllm-python/

# Install all required dependencies
# We have to install autoawq first to avoid conflict with torch, then reinstall torch with vllm
# below
# pip install autoawq --no-cache-dir && \
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --extra-index-url "https://huggingface.github.io/autogptq-index/whl/cu118/" \
-v --no-cache-dir \
pip3 install -v --no-cache-dir \
"ray==2.6.0" "vllm==0.2.2" xformers && \
pip install --no-cache-dir -e .
pip3 install --no-cache-dir -e .

COPY openllm-core/src openllm-core/src
COPY hatch.toml README.md CHANGELOG.md openllm-core/pyproject.toml /openllm-core/
RUN --mount=type=cache,target=/root/.cache/pip pip3 install -v --no-cache-dir -e /openllm-core/

COPY openllm-client/src openllm-client/src
COPY hatch.toml README.md CHANGELOG.md openllm-client/pyproject.toml /openllm-client/
RUN --mount=type=cache,target=/root/.cache/pip pip3 install -v --no-cache-dir -e /openllm-client/

FROM base-container


@@ -50,13 +50,17 @@ class RefResolver:
else:
raise ValueError(f'Unknown strategy: {strategy_or_version}')

# fmt: off
@property
def tag(self):return 'latest' if self.strategy in {'latest','nightly'} else repr(self.version)
def tag(self):
return 'latest' if self.strategy in {'latest', 'nightly'} else repr(self.version)

@staticmethod
def construct_base_image(reg,strategy=None):
if reg == 'gh': logger.warning("Setting base registry to 'gh' will affect cold start performance on GCP/AWS.")
elif reg == 'docker': logger.warning('docker is base image is yet to be supported. Falling back to "ecr".'); reg = 'ecr'
def construct_base_image(reg, strategy=None):
if reg == 'gh':
logger.warning("Setting base registry to 'gh' will affect cold start performance on GCP/AWS.")
elif reg == 'docker':
logger.warning('docker is base image is yet to be supported. Falling back to "ecr".')
reg = 'ecr'
return f'{_CONTAINER_REGISTRY[reg]}:{RefResolver.from_strategy(strategy).tag}'


@@ -1,3 +1,10 @@
# fmt: off
def __dir__():import openllm_client as _client;return sorted(dir(_client))
def __getattr__(it):import openllm_client as _client;return getattr(_client, it)
def __dir__():
import openllm_client as _client

return sorted(dir(_client))


def __getattr__(it):
import openllm_client as _client

return getattr(_client, it)

@@ -1,9 +1,9 @@
"""OpenLLM Python client.
'''OpenLLM Python client.

```python
client = openllm.client.HTTPClient("http://localhost:8080")
client.query("What is the difference between gather and scatter?")
```
"""
'''

from openllm_client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient

@@ -1,11 +1,11 @@
|
||||
"""Entrypoint for all third-party apps.
|
||||
'''Entrypoint for all third-party apps.
|
||||
|
||||
Currently support OpenAI, Cohere compatible API.
|
||||
|
||||
Each module should implement the following API:
|
||||
|
||||
- `mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[M, T]) -> bentoml.Service: ...`
|
||||
"""
|
||||
'''
|
||||
|
||||
from bentoml import Service
|
||||
from openllm_core._typing_compat import M, T
|
||||
|
||||
@@ -11,7 +11,7 @@ from openllm_core.utils import first_not_none
|
||||
|
||||
OPENAPI_VERSION, API_VERSION = '3.0.2', '1.0'
|
||||
# NOTE: OpenAI schema
|
||||
LIST_MODELS_SCHEMA = """\
|
||||
LIST_MODELS_SCHEMA = '''\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -41,8 +41,8 @@ responses:
|
||||
owned_by: 'na'
|
||||
schema:
|
||||
$ref: '#/components/schemas/ModelList'
|
||||
"""
|
||||
CHAT_COMPLETIONS_SCHEMA = """\
|
||||
'''
|
||||
CHAT_COMPLETIONS_SCHEMA = '''\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -179,8 +179,8 @@ responses:
|
||||
}
|
||||
}
|
||||
description: Bad Request
|
||||
"""
|
||||
COMPLETIONS_SCHEMA = """\
|
||||
'''
|
||||
COMPLETIONS_SCHEMA = '''\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -332,8 +332,8 @@ responses:
|
||||
}
|
||||
}
|
||||
description: Bad Request
|
||||
"""
|
||||
HF_AGENT_SCHEMA = """\
|
||||
'''
|
||||
HF_AGENT_SCHEMA = '''\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -377,8 +377,8 @@ responses:
|
||||
schema:
|
||||
$ref: '#/components/schemas/HFErrorResponse'
|
||||
description: Not Found
|
||||
"""
|
||||
HF_ADAPTERS_SCHEMA = """\
|
||||
'''
|
||||
HF_ADAPTERS_SCHEMA = '''\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -408,8 +408,8 @@ responses:
|
||||
schema:
|
||||
$ref: '#/components/schemas/HFErrorResponse'
|
||||
description: Not Found
|
||||
"""
|
||||
COHERE_GENERATE_SCHEMA = """\
|
||||
'''
|
||||
COHERE_GENERATE_SCHEMA = '''\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -453,8 +453,8 @@ requestBody:
|
||||
stop_sequences:
|
||||
- "\\n"
|
||||
- "<|endoftext|>"
|
||||
"""
|
||||
COHERE_CHAT_SCHEMA = """\
|
||||
'''
|
||||
COHERE_CHAT_SCHEMA = '''\
|
||||
---
|
||||
consumes:
|
||||
- application/json
|
||||
@@ -467,7 +467,7 @@ tags:
|
||||
- Cohere
|
||||
x-bentoml-name: cohere_chat
|
||||
summary: Creates a model response for the given chat conversation.
|
||||
"""
|
||||
'''
|
||||
|
||||
_SCHEMAS = {k[:-7].lower(): v for k, v in locals().items() if k.endswith('_SCHEMA')}
|
||||
|
||||
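A small worked example of how the `_SCHEMAS` keys are derived from the module-level names above:

```python
# k[:-7] drops the trailing '_SCHEMA' (7 characters), so the comprehension maps e.g.
#   'CHAT_COMPLETIONS_SCHEMA' -> 'chat_completions'
#   'COHERE_GENERATE_SCHEMA'  -> 'cohere_generate'
```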
|
||||
@@ -14,11 +14,11 @@ P = ParamSpec('P')
|
||||
|
||||
|
||||
def load_tokenizer(llm: LLM[M, T], **tokenizer_attrs: t.Any) -> TypeGuard[T]:
|
||||
"""Load the tokenizer from BentoML store.
|
||||
'''Load the tokenizer from BentoML store.
|
||||
|
||||
By default, it will try to find the bentomodel whether it is in store..
|
||||
If model is not found, it will raises a ``bentoml.exceptions.NotFound``.
|
||||
"""
|
||||
'''
|
||||
import cloudpickle
|
||||
import fs
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
"""Serialisation utilities for OpenLLM.
|
||||
'''Serialisation utilities for OpenLLM.
|
||||
|
||||
Currently supports transformers for PyTorch, and vLLM.
|
||||
|
||||
Currently, GGML format is working in progress.
|
||||
"""
|
||||
'''
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
# fmt: off
import functools, importlib.metadata, openllm_core

__all__ = ['generate_labels', 'available_devices', 'device_count']


def generate_labels(llm):
return {
'backend': llm.__llm_backend__,
@@ -11,10 +12,25 @@ def generate_labels(llm):
'serialisation': llm._serialisation,
**{package: importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
}
def available_devices():from ._strategies import NvidiaGpuResource;return tuple(NvidiaGpuResource.from_system())


def available_devices():
from ._strategies import NvidiaGpuResource

return tuple(NvidiaGpuResource.from_system())


@functools.lru_cache(maxsize=1)
def device_count()->int:return len(available_devices())
def __dir__():coreutils=set(dir(openllm_core.utils))|set([it for it in openllm_core.utils._extras if not it.startswith('_')]);return sorted(__all__)+sorted(list(coreutils))
def device_count() -> int:
return len(available_devices())


def __dir__():
coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')])
return sorted(__all__) + sorted(list(coreutils))


def __getattr__(it):
if hasattr(openllm_core.utils, it):return getattr(openllm_core.utils, it)
if hasattr(openllm_core.utils, it):
return getattr(openllm_core.utils, it)
raise AttributeError(f'module {__name__} has no attribute {it}')

@@ -1,4 +1,4 @@
|
||||
"""OpenLLM CLI.
|
||||
'''OpenLLM CLI.
|
||||
|
||||
For more information see ``openllm -h``.
|
||||
"""
|
||||
'''
|
||||
|
||||
@@ -146,7 +146,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC
|
||||
backend_option(factory=cog.optgroup),
|
||||
cog.optgroup.group(
|
||||
'LLM Optimization Options',
|
||||
help="""Optimization related options.
|
||||
help='''Optimization related options.
|
||||
|
||||
OpenLLM supports running model k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM.
|
||||
|
||||
@@ -154,7 +154,7 @@ def start_decorator(serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC
|
||||
|
||||
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
|
||||
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
|
||||
""",
|
||||
''',
|
||||
),
|
||||
quantize_option(factory=cog.optgroup),
|
||||
serialisation_option(factory=cog.optgroup),
|
||||
@@ -196,7 +196,7 @@ _IGNORED_OPTIONS = {'working_dir', 'production', 'protocol_version'}
|
||||
|
||||
|
||||
def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]:
|
||||
"""Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`."""
|
||||
'''Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.'''
|
||||
from bentoml_cli.cli import cli
|
||||
|
||||
command = 'serve' if not serve_grpc else 'serve-grpc'
|
||||
@@ -233,11 +233,11 @@ _http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args
|
||||
|
||||
|
||||
def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]:
|
||||
"""General ``@click`` decorator with some sauce.
|
||||
'''General ``@click`` decorator with some sauce.
|
||||
|
||||
This decorator extends the default ``@click.option`` plus a factory option and factory attr to
|
||||
provide type-safe click.option or click.argument wrapper for all compatible factory.
|
||||
"""
|
||||
'''
|
||||
factory = attrs.pop('factory', click)
|
||||
factory_attr = attrs.pop('attr', 'option')
|
||||
if factory_attr != 'argument':
|
||||
@@ -346,7 +346,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
|
||||
default=None,
|
||||
envvar='OPENLLM_QUANTIZE',
|
||||
show_envvar=True,
|
||||
help="""Dynamic quantization for running this LLM.
|
||||
help='''Dynamic quantization for running this LLM.
|
||||
|
||||
The following quantization strategies are supported:
|
||||
|
||||
@@ -361,15 +361,15 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att
|
||||
- ``squeezellm``: ``SqueezeLLM`` [SqueezeLLM: Dense-and-Sparse Quantization](https://arxiv.org/abs/2306.07629)
|
||||
|
||||
> [!NOTE] that the model can also be served with quantized weights.
|
||||
"""
|
||||
'''
|
||||
+ (
|
||||
"""
|
||||
> [!NOTE] that this will set the mode for serving within deployment."""
|
||||
'''
|
||||
> [!NOTE] that this will set the mode for serving within deployment.'''
|
||||
if build
|
||||
else ''
|
||||
)
|
||||
+ """
|
||||
> [!NOTE] that quantization are currently only available in *PyTorch* models.""",
|
||||
+ '''
|
||||
> [!NOTE] that quantization are currently only available in *PyTorch* models.''',
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
@@ -383,7 +383,7 @@ def workers_per_resource_option(
|
||||
callback=workers_per_resource_callback,
|
||||
type=str,
|
||||
required=False,
|
||||
help="""Number of workers per resource assigned.
|
||||
help='''Number of workers per resource assigned.
|
||||
|
||||
See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy
|
||||
for more information. By default, this is set to 1.
|
||||
@@ -393,7 +393,7 @@ def workers_per_resource_option(
|
||||
- ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models.
|
||||
|
||||
- ``conserved``: This will determine the number of available GPU resources. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``.
|
||||
"""
|
||||
'''
|
||||
+ (
|
||||
"""\n
|
||||
> [!NOTE] The workers value passed into 'build' will determine how the LLM can
|
||||
@@ -416,7 +416,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
|
||||
show_default=True,
|
||||
show_envvar=True,
|
||||
envvar='OPENLLM_SERIALIZATION',
|
||||
help="""Serialisation format for save/load LLM.
|
||||
help='''Serialisation format for save/load LLM.
|
||||
|
||||
Currently the following strategies are supported:
|
||||
|
||||
@@ -425,7 +425,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
|
||||
> [!NOTE] Safetensors might not work for every cases, and you can always fallback to ``legacy`` if needed.
|
||||
|
||||
- ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors.
|
||||
""",
|
||||
''',
|
||||
**attrs,
|
||||
)(f)
|
||||
|
||||
|
||||
@@ -291,7 +291,7 @@ def _import_model(
|
||||
|
||||
|
||||
def _list_models() -> dict[str, t.Any]:
|
||||
"""List all available models within the local store."""
|
||||
'''List all available models within the local store.'''
|
||||
from .entrypoint import models_command
|
||||
|
||||
return models_command.main(args=['--quiet'], standalone_mode=False)
|
||||
|
||||
@@ -94,14 +94,14 @@ else:
|
||||
|
||||
P = ParamSpec('P')
|
||||
logger = logging.getLogger('openllm')
|
||||
OPENLLM_FIGLET = """\
|
||||
OPENLLM_FIGLET = '''\
|
||||
██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
|
||||
██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
|
||||
██║ ██║██████╔╝█████╗ ██╔██╗ ██║██║ ██║ ██╔████╔██║
|
||||
██║ ██║██╔═══╝ ██╔══╝ ██║╚██╗██║██║ ██║ ██║╚██╔╝██║
|
||||
╚██████╔╝██║ ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
|
||||
╚═════╝ ╚═╝ ╚══════╝╚═╝ ╚═══╝╚══════╝╚══════╝╚═╝ ╚═╝
|
||||
"""
|
||||
'''
|
||||
|
||||
ServeCommand = t.Literal['serve', 'serve-grpc']
|
||||
|
||||
@@ -287,7 +287,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup):
|
||||
return decorator
|
||||
|
||||
def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
|
||||
"""Additional format methods that include extensions as well as the default cli command."""
|
||||
'''Additional format methods that include extensions as well as the default cli command.'''
|
||||
from gettext import gettext as _
|
||||
|
||||
commands: list[tuple[str, click.Command]] = []
|
||||
@@ -334,7 +334,7 @@ _PACKAGE_NAME = 'openllm'
|
||||
message=f'{_PACKAGE_NAME}, %(version)s (compiled: {openllm.COMPILED})\nPython ({platform.python_implementation()}) {platform.python_version()}',
|
||||
)
|
||||
def cli() -> None:
|
||||
"""\b
|
||||
'''\b
|
||||
██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗
|
||||
██╔═══██╗██╔══██╗██╔════╝████╗ ██║██║ ██║ ████╗ ████║
|
||||
██║ ██║██████╔╝█████╗ ██╔██╗ ██║██║ ██║ ██╔████╔██║
|
||||
@@ -345,7 +345,7 @@ def cli() -> None:
|
||||
\b
|
||||
An open platform for operating large language models in production.
|
||||
Fine-tune, serve, deploy, and monitor any LLMs with ease.
|
||||
"""
|
||||
'''
|
||||
|
||||
|
||||
@cli.command(
|
||||
@@ -389,13 +389,13 @@ def start_command(
|
||||
max_model_len: int | None,
|
||||
**attrs: t.Any,
|
||||
) -> LLMConfig | subprocess.Popen[bytes]:
|
||||
"""Start any LLM as a REST server.
|
||||
'''Start any LLM as a REST server.
|
||||
|
||||
\b
|
||||
```bash
|
||||
$ openllm <start|start-http> <model_id> --<options> ...
|
||||
```
|
||||
"""
|
||||
'''
|
||||
if model_id in openllm.CONFIG_MAPPING:
|
||||
_model_name = model_id
|
||||
if deprecated_model_id is not None:
|
||||
@@ -519,13 +519,13 @@ def start_grpc_command(
|
||||
max_model_len: int | None,
|
||||
**attrs: t.Any,
|
||||
) -> LLMConfig | subprocess.Popen[bytes]:
|
||||
"""Start any LLM as a gRPC server.
|
||||
'''Start any LLM as a gRPC server.
|
||||
|
||||
\b
|
||||
```bash
|
||||
$ openllm start-grpc <model_id> --<options> ...
|
||||
```
|
||||
"""
|
||||
'''
|
||||
termui.warning(
|
||||
'Continuous batching is currently not yet supported with gPRC. If you want to use continuous batching with gRPC, feel free to open a GitHub issue about your usecase.\n'
|
||||
)
|
||||
@@ -955,7 +955,7 @@ def build_command(
|
||||
force_push: bool,
|
||||
**_: t.Any,
|
||||
) -> BuildBentoOutput:
|
||||
"""Package a given models into a BentoLLM.
|
||||
'''Package a given models into a BentoLLM.
|
||||
|
||||
\b
|
||||
```bash
|
||||
@@ -971,7 +971,7 @@ def build_command(
|
||||
> [!IMPORTANT]
|
||||
> To build the bento with compiled OpenLLM, make sure to prepend HATCH_BUILD_HOOKS_ENABLE=1. Make sure that the deployment
|
||||
> target also use the same Python version and architecture as build machine.
|
||||
"""
|
||||
'''
|
||||
from openllm.serialisation.transformers.weights import has_safetensors_weights
|
||||
|
||||
if model_id in openllm.CONFIG_MAPPING:
|
||||
@@ -1167,13 +1167,13 @@ class ModelItem(t.TypedDict):
|
||||
@cli.command()
|
||||
@click.option('--show-available', is_flag=True, default=True, hidden=True)
|
||||
def models_command(**_: t.Any) -> dict[t.LiteralString, ModelItem]:
|
||||
"""List all supported models.
|
||||
'''List all supported models.
|
||||
|
||||
\b
|
||||
```bash
|
||||
openllm models
|
||||
```
|
||||
"""
|
||||
'''
|
||||
result: dict[t.LiteralString, ModelItem] = {
|
||||
m: ModelItem(
|
||||
architecture=config.__openllm_architecture__,
|
||||
@@ -1216,11 +1216,11 @@ def prune_command(
|
||||
bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
|
||||
**_: t.Any,
|
||||
) -> None:
|
||||
"""Remove all saved models, and bentos built with OpenLLM locally.
|
||||
'''Remove all saved models, and bentos built with OpenLLM locally.
|
||||
|
||||
\b
|
||||
If a model type is passed, then only prune models for that given model type.
|
||||
"""
|
||||
'''
|
||||
available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [
|
||||
(m, model_store)
|
||||
for m in bentoml.models.list()
|
||||
@@ -1326,13 +1326,13 @@ def query_command(
|
||||
_memoized: DictStrAny,
|
||||
**_: t.Any,
|
||||
) -> None:
|
||||
"""Query a LLM interactively, from a terminal.
|
||||
'''Query a LLM interactively, from a terminal.
|
||||
|
||||
\b
|
||||
```bash
|
||||
$ openllm query --endpoint http://12.323.2.1:3000 "What is the meaning of life?"
|
||||
```
|
||||
"""
|
||||
'''
|
||||
if server_type == 'grpc':
|
||||
raise click.ClickException("'grpc' is currently disabled.")
|
||||
_memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
|
||||
@@ -1353,7 +1353,7 @@ def query_command(
|
||||
|
||||
@cli.group(cls=Extensions, hidden=True, name='extension')
|
||||
def extension_command() -> None:
|
||||
"""Extension for OpenLLM CLI."""
|
||||
'''Extension for OpenLLM CLI.'''
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -71,7 +71,7 @@ def build_container(
|
||||
@click.command(
|
||||
'build_base_container',
|
||||
context_settings=termui.CONTEXT_SETTINGS,
|
||||
help="""Base image builder for BentoLLM.
|
||||
help='''Base image builder for BentoLLM.
|
||||
|
||||
By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
|
||||
Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
|
||||
@@ -81,7 +81,7 @@ def build_container(
|
||||
This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.
|
||||
|
||||
Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
|
||||
""",
|
||||
''',
|
||||
)
|
||||
@container_registry_option
|
||||
@click.option(
|
||||
|
||||
@@ -24,7 +24,7 @@ if t.TYPE_CHECKING:
|
||||
def cli(
|
||||
ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]
|
||||
) -> str | None:
|
||||
"""Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path)."""
|
||||
'''Dive into a BentoLLM. This is synonymous to cd $(b get <bento>:<tag> -o path).'''
|
||||
try:
|
||||
bentomodel = _bento_store.get(bento)
|
||||
except bentoml.exceptions.NotFound:
|
||||
|
||||
@@ -13,7 +13,7 @@ from openllm_cli import termui
|
||||
@click.command('list_bentos', context_settings=termui.CONTEXT_SETTINGS)
|
||||
@click.pass_context
|
||||
def cli(ctx: click.Context) -> None:
|
||||
"""List available bentos built by OpenLLM."""
|
||||
'''List available bentos built by OpenLLM.'''
|
||||
mapping = {
|
||||
k: [
|
||||
{
|
||||
|
||||
@@ -18,7 +18,7 @@ if t.TYPE_CHECKING:
|
||||
@click.command('list_models', context_settings=termui.CONTEXT_SETTINGS)
|
||||
@model_name_argument(required=False, shell_complete=model_complete_envvar)
|
||||
def cli(model_name: str | None) -> DictStrAny:
|
||||
"""This is equivalent to openllm models --show-available less the nice table."""
|
||||
'''This is equivalent to openllm models --show-available less the nice table.'''
|
||||
models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())
|
||||
ids_in_local_store = {
|
||||
k: [
|
||||
|
||||