chore(style): synchronized style across packages [skip ci]

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2026-05-24 00:24:41 -04:00 · 2023-08-23 08:46:22 -04:00
parent bbd9aa7646
commit 787ce1b3b6
124 changed files with 2775 additions and 2771 deletions
--- a/openllm-python/src/openllm/models/auto/__init__.py
+++ b/openllm-python/src/openllm/models/auto/__init__.py
@@ -4,10 +4,10 @@ import openllm
 from openllm_core.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available
 from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES
 _import_structure: dict[str, list[str]] = {
-    "modeling_auto": ["MODEL_MAPPING_NAMES"],
-    "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"],
-    "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"],
-    "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]
+    'modeling_auto': ['MODEL_MAPPING_NAMES'],
+    'modeling_flax_auto': ['MODEL_FLAX_MAPPING_NAMES'],
+    'modeling_tf_auto': ['MODEL_TF_MAPPING_NAMES'],
+    'modeling_vllm_auto': ['MODEL_VLLM_MAPPING_NAMES']
 }
 if t.TYPE_CHECKING:
  from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
@@ -19,31 +19,31 @@ try:
 except openllm.exceptions.MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_auto"].extend(["AutoLLM", "MODEL_MAPPING"])
+  _import_structure['modeling_auto'].extend(['AutoLLM', 'MODEL_MAPPING'])
  if t.TYPE_CHECKING: from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING, AutoLLM as AutoLLM
 try:
  if not is_vllm_available(): raise openllm.exceptions.MissingDependencyError
 except openllm.exceptions.MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_auto"].extend(["AutoVLLM", "MODEL_VLLM_MAPPING"])
+  _import_structure['modeling_vllm_auto'].extend(['AutoVLLM', 'MODEL_VLLM_MAPPING'])
  if t.TYPE_CHECKING: from .modeling_vllm_auto import MODEL_VLLM_MAPPING as MODEL_VLLM_MAPPING, AutoVLLM as AutoVLLM
 try:
  if not is_flax_available(): raise openllm.exceptions.MissingDependencyError
 except openllm.exceptions.MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_flax_auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING"])
+  _import_structure['modeling_flax_auto'].extend(['AutoFlaxLLM', 'MODEL_FLAX_MAPPING'])
  if t.TYPE_CHECKING: from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM
 try:
  if not is_tf_available(): raise openllm.exceptions.MissingDependencyError
 except openllm.exceptions.MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_tf_auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING"])
+  _import_structure['modeling_tf_auto'].extend(['AutoTFLLM', 'MODEL_TF_MAPPING'])
  if t.TYPE_CHECKING: from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING, AutoTFLLM as AutoTFLLM

-__lazy = LazyModule(__name__, os.path.abspath("__file__"), _import_structure)
+__lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure)
 __all__ = __lazy.__all__
 __dir__ = __lazy.__dir__
 __getattr__ = __lazy.__getattr__
--- a/openllm-python/src/openllm/models/auto/factory.py
+++ b/openllm-python/src/openllm/models/auto/factory.py
@@ -25,20 +25,20 @@ class BaseAutoLLMClass:
  @classmethod
  def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False,
                **attrs: t.Any) -> openllm.LLM[t.Any, t.Any]:
-    """The lower level API for creating a LLM instance.
+    '''The lower level API for creating a LLM instance.

    ```python
    >>> import openllm
    >>> llm = openllm.AutoLLM.for_model("flan-t5")
    ```
-    """
+    '''
    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
    if ensure_available: llm.ensure_model_id_exists()
    return llm

  @classmethod
  def create_runner(cls, model: str, model_id: str | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
-    """Create a LLM Runner for the given model name.
+    '''Create a LLM Runner for the given model name.

    Args:
    model: The model name to instantiate.
@@ -47,7 +47,7 @@ class BaseAutoLLMClass:

    Returns:
    A LLM instance.
-    """
+    '''
    runner_kwargs_name = set(inspect.signature(openllm.LLM[t.Any, t.Any].to_runner).parameters)
    runner_attrs = {k: v for k, v in attrs.items() if k in runner_kwargs_name}
    for k in runner_attrs:
@@ -56,15 +56,15 @@ class BaseAutoLLMClass:

  @classmethod
  def register(cls, config_class: type[openllm.LLMConfig], llm_class: type[openllm.LLM[t.Any, t.Any]]) -> None:
-    """Register a new model for this class.
+    '''Register a new model for this class.

    Args:
    config_class: The configuration corresponding to the model to register.
    llm_class: The runnable to register.
-    """
-    if hasattr(llm_class, "config_class") and llm_class.config_class is not config_class:
+    '''
+    if hasattr(llm_class, 'config_class') and llm_class.config_class is not config_class:
      raise ValueError(
-          f"The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has {llm_class.config_class} and you passed {config_class}. Fix one of those so they match!"
+          f'The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has {llm_class.config_class} and you passed {config_class}. Fix one of those so they match!'
      )
    cls._model_mapping.register(config_class, llm_class)

@@ -80,13 +80,13 @@ def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any:
  if isinstance(attr, tuple): return tuple(getattribute_from_module(module, a) for a in attr)
  if hasattr(module, attr): return getattr(module, attr)
  # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the object at the top level.
-  openllm_module = importlib.import_module("openllm")
+  openllm_module = importlib.import_module('openllm')
  if module != openllm_module:
    try:
      return getattribute_from_module(openllm_module, attr)
    except ValueError:
-      raise ValueError(f"Could not find {attr} neither in {module} nor in {openllm_module}!") from None
-  raise ValueError(f"Could not find {attr} in {openllm_module}!")
+      raise ValueError(f'Could not find {attr} neither in {module} nor in {openllm_module}!') from None
+  raise ValueError(f'Could not find {attr} in {openllm_module}!')
 class _LazyAutoMapping(OrderedDict, ReprMixin):
  """Based on transformers.models.auto.configuration_auto._LazyAutoMapping.

@@ -112,7 +112,7 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):

  def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any:
    module_name = inflection.underscore(model_type)
-    if module_name not in self._modules: self._modules[module_name] = importlib.import_module(f".{module_name}", "openllm.models")
+    if module_name not in self._modules: self._modules[module_name] = importlib.import_module(f'.{module_name}', 'openllm.models')
    return getattribute_from_module(self._modules[module_name], attr)

  def __len__(self) -> int:
@@ -133,33 +133,33 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):

  def keys(self) -> ConfigModelKeysView:
    return t.cast(
-        "ConfigModelKeysView", [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())
+        'ConfigModelKeysView', [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())
    )

  def values(self) -> ConfigModelValuesView:
    return t.cast(
-        "ConfigModelValuesView", [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(
+        'ConfigModelValuesView', [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(
            self._extra_content.values()
        )
    )

  def items(self) -> ConfigModelItemsView:
    return t.cast(
-        "ConfigModelItemsView",
+        'ConfigModelItemsView',
        [(self._load_attr_from_module(key, self._config_mapping[key]),
          self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items())
    )

  def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]:
-    return iter(t.cast("SupportsIter[t.Iterator[type[openllm.LLMConfig]]]", self.keys()))
+    return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys()))

  def __contains__(self, item: t.Any) -> bool:
    if item in self._extra_content: return True
-    if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: return False
+    if not hasattr(item, '__name__') or item.__name__ not in self._reverse_config_mapping: return False
    return self._reverse_config_mapping[item.__name__] in self._model_mapping

  def register(self, key: t.Any, value: t.Any) -> None:
-    if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping:
+    if hasattr(key, '__name__') and key.__name__ in self._reverse_config_mapping:
      if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM model.")
    self._extra_content[key] = value
-__all__ = ["BaseAutoLLMClass", "_LazyAutoMapping"]
+__all__ = ['BaseAutoLLMClass', '_LazyAutoMapping']
--- a/openllm-python/src/openllm/models/auto/modeling_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_auto.py
@@ -3,9 +3,9 @@ import typing as t
 from collections import OrderedDict
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
 from openllm_core.config import CONFIG_MAPPING_NAMES
-MODEL_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLM"), ("dolly_v2", "DollyV2"), ("falcon", "Falcon"), ("flan_t5", "FlanT5"), ("gpt_neox", "GPTNeoX"), ("llama", "Llama"), ("mpt", "MPT"), (
-    "opt", "OPT"
-), ("stablelm", "StableLM"), ("starcoder", "StarCoder"), ("baichuan", "Baichuan")])
+MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), (
+    'opt', 'OPT'
+), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
 class AutoLLM(BaseAutoLLMClass):
  _model_mapping: t.ClassVar = MODEL_MAPPING
--- a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py
@@ -3,7 +3,7 @@ import typing as t
 from collections import OrderedDict
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
 from openllm_core.config import CONFIG_MAPPING_NAMES
-MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5"), ("opt", "FlaxOPT")])
+MODEL_FLAX_MAPPING_NAMES = OrderedDict([('flan_t5', 'FlaxFlanT5'), ('opt', 'FlaxOPT')])
 MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES)
 class AutoFlaxLLM(BaseAutoLLMClass):
  _model_mapping: t.ClassVar = MODEL_FLAX_MAPPING
--- a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py
@@ -3,7 +3,7 @@ import typing as t
 from collections import OrderedDict
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
 from openllm_core.config import CONFIG_MAPPING_NAMES
-MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5"), ("opt", "TFOPT")])
+MODEL_TF_MAPPING_NAMES = OrderedDict([('flan_t5', 'TFFlanT5'), ('opt', 'TFOPT')])
 MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)
 class AutoTFLLM(BaseAutoLLMClass):
  _model_mapping: t.ClassVar = MODEL_TF_MAPPING
--- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
@@ -3,9 +3,9 @@ import typing as t
 from collections import OrderedDict
 from .factory import BaseAutoLLMClass, _LazyAutoMapping
 from openllm_core.config import CONFIG_MAPPING_NAMES
-MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), (
-    "opt", "VLLMOPT"
-), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")])
+MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), (
+    'opt', 'VLLMOPT'
+), ('stablelm', 'VLLMStableLM'), ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
 MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
 class AutoVLLM(BaseAutoLLMClass):
  _model_mapping: t.ClassVar = MODEL_VLLM_MAPPING
--- a/openllm-python/src/openllm/models/baichuan/__init__.py
+++ b/openllm-python/src/openllm/models/baichuan/__init__.py
@@ -9,14 +9,14 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_baichuan"] = ["Baichuan"]
+  _import_structure['modeling_baichuan'] = ['Baichuan']
  if t.TYPE_CHECKING: from .modeling_baichuan import Baichuan as Baichuan
 try:
  if not is_vllm_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_baichuan"] = ["VLLMBaichuan"]
+  _import_structure['modeling_vllm_baichuan'] = ['VLLMBaichuan']
  if t.TYPE_CHECKING: from .modeling_vllm_baichuan import VLLMBaichuan as VLLMBaichuan

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
@@ -1,12 +1,12 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import transformers
-class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
+class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
-    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):  # type: ignore[attr-defined]
+    inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
      outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
      return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import vllm, transformers
-class VLLMBaichuan(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]):
+class VLLMBaichuan(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizerBase']):
  __openllm_internal__ = True
-  tokenizer_id = "local"
+  tokenizer_id = 'local'
--- a/openllm-python/src/openllm/models/chatglm/__init__.py
+++ b/openllm-python/src/openllm/models/chatglm/__init__.py
@@ -9,7 +9,7 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_chatglm"] = ["ChatGLM"]
+  _import_structure['modeling_chatglm'] = ['ChatGLM']
  if t.TYPE_CHECKING: from .modeling_chatglm import ChatGLM as ChatGLM

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import transformers
-class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
+class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerFast']):
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]:
@@ -17,7 +17,7 @@ class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrain
    embeddings: list[list[float]] = []
    num_tokens = 0
    for prompt in prompts:
-      input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
+      input_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(self.device)
      with torch.inference_mode():
        outputs = self.model(input_ids, output_hidden_states=True)
        data = F.normalize(torch.mean(outputs.hidden_states[-1].transpose(0, 1), dim=0), p=2, dim=0)
--- a/openllm-python/src/openllm/models/dolly_v2/init.py
+++ b/openllm-python/src/openllm/models/dolly_v2/init.py
@@ -9,14 +9,14 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_dolly_v2"] = ["DollyV2"]
+  _import_structure['modeling_dolly_v2'] = ['DollyV2']
  if t.TYPE_CHECKING: from .modeling_dolly_v2 import DollyV2 as DollyV2
 try:
  if not is_vllm_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_dolly_v2"] = ["VLLMDollyV2"]
+  _import_structure['modeling_vllm_dolly_v2'] = ['VLLMDollyV2']
  if t.TYPE_CHECKING: from .modeling_vllm_dolly_v2 import VLLMDollyV2 as VLLMDollyV2

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -4,7 +4,7 @@ from openllm_core._typing_compat import overload
 from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id

 if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
-else:  torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow")
+else:  torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
 logger = logging.getLogger(__name__)
 @overload
 def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline:
@@ -31,25 +31,25 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
          response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
          end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)
          # Ensure generation stops once it generates "### End"
-          generate_kwargs["eos_token_id"] = end_key_token_id
+          generate_kwargs['eos_token_id'] = end_key_token_id
        except ValueError:
          pass
      forward_params = generate_kwargs
-      postprocess_params = {"response_key_token_id": response_key_token_id, "end_key_token_id": end_key_token_id}
-      if return_full_text is not None: postprocess_params["return_full_text"] = return_full_text
+      postprocess_params = {'response_key_token_id': response_key_token_id, 'end_key_token_id': end_key_token_id}
+      if return_full_text is not None: postprocess_params['return_full_text'] = return_full_text
      return preprocess_params, forward_params, postprocess_params

    def preprocess(self, input_: str, **generate_kwargs: t.Any) -> t.Dict[str, t.Any]:
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=input_)
-      inputs = self.tokenizer(prompt_text, return_tensors="pt")
-      inputs["prompt_text"] = prompt_text
-      inputs["instruction_text"] = input_
+      inputs = self.tokenizer(prompt_text, return_tensors='pt')
+      inputs['prompt_text'] = prompt_text
+      inputs['instruction_text'] = input_
      return t.cast(t.Dict[str, t.Any], inputs)

    def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
      if t.TYPE_CHECKING: assert self.tokenizer is not None
-      input_ids, attention_mask = input_tensors["input_ids"], input_tensors.get("attention_mask", None)
+      input_ids, attention_mask = input_tensors['input_ids'], input_tensors.get('attention_mask', None)
      if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
      else: in_b = input_ids.shape[0]
      generated_sequence = self.model.generate(
@@ -59,16 +59,16 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
          **generate_kwargs
      )
      out_b = generated_sequence.shape[0]
-      if self.framework == "pt": generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
-      elif self.framework == "tf": generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
-      instruction_text = input_tensors.pop("instruction_text")
-      return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}
+      if self.framework == 'pt': generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
+      elif self.framework == 'tf': generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
+      instruction_text = input_tensors.pop('instruction_text')
+      return {'generated_sequence': generated_sequence, 'input_ids': input_ids, 'instruction_text': instruction_text}

-    def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal["generated_text"], str]]:
+    def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal['generated_text'], str]]:
      if t.TYPE_CHECKING: assert self.tokenizer is not None
-      _generated_sequence, instruction_text = model_outputs["generated_sequence"][0], model_outputs["instruction_text"]
+      _generated_sequence, instruction_text = model_outputs['generated_sequence'][0], model_outputs['instruction_text']
      generated_sequence: list[list[int]] = _generated_sequence.numpy().tolist()
-      records: list[dict[t.Literal["generated_text"], str]] = []
+      records: list[dict[t.Literal['generated_text'], str]] = []
      for sequence in generated_sequence:
        # The response will be set to this variable if we can identify it.
        decoded = None
@@ -80,7 +80,7 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
            response_pos = sequence.index(response_key_token_id)
          except ValueError:
            response_pos = None
-          if response_pos is None: logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence)
+          if response_pos is None: logger.warning('Could not find response key %s in: %s', response_key_token_id, sequence)
          if response_pos:
            # Next find where "### End" is located.  The model has been trained to end its responses with this
            # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
@@ -96,33 +96,33 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr
          fully_decoded = self.tokenizer.decode(sequence)
          # The response appears after "### Response:".  The model has been trained to append "### End" at the
          # end.
-          m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)
+          m = re.search(r'#+\s*Response:\s*(.+?)#+\s*End', fully_decoded, flags=re.DOTALL)
          if m: decoded = m.group(1).strip()
          else:
            # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
            # return everything after "### Response:".
-            m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
+            m = re.search(r'#+\s*Response:\s*(.+)', fully_decoded, flags=re.DOTALL)
            if m: decoded = m.group(1).strip()
-            else: logger.warning("Failed to find response in:\n%s", fully_decoded)
+            else: logger.warning('Failed to find response in:\n%s', fully_decoded)
        # If the full text is requested, then append the decoded text to the original instruction.
        # This technically isn't the full text, as we format the instruction in the prompt the model has been
        # trained on, but to the client it will appear to be the full text.
-        if return_full_text: decoded = f"{instruction_text}\n{decoded}"
-        records.append({"generated_text": t.cast(str, decoded)})
+        if return_full_text: decoded = f'{instruction_text}\n{decoded}'
+        records.append({'generated_text': t.cast(str, decoded)})
      return records

  return InstructionTextGenerationPipeline() if _init else InstructionTextGenerationPipeline
-class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedTokenizer"]):
+class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedTokenizer']):
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
-    return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16}, {}
+    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {}

  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
    return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text)

-  def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
+  def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
    llm_config = self.config.model_construct_env(**attrs)
    with torch.inference_mode():
      return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_vllm_dolly_v2.py
@@ -3,6 +3,6 @@ import logging, typing as t, openllm
 if t.TYPE_CHECKING: import vllm, transformers

 logger = logging.getLogger(__name__)
-class VLLMDollyV2(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizer"]):
+class VLLMDollyV2(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizer']):
  __openllm_internal__ = True
-  tokenizer_id = "local"
+  tokenizer_id = 'local'
--- a/openllm-python/src/openllm/models/falcon/__init__.py
+++ b/openllm-python/src/openllm/models/falcon/__init__.py
@@ -9,14 +9,14 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_falcon"] = ["Falcon"]
+  _import_structure['modeling_falcon'] = ['Falcon']
  if t.TYPE_CHECKING: from .modeling_falcon import Falcon as Falcon
 try:
  if not is_vllm_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_falcon"] = ["VLLMFalcon"]
+  _import_structure['modeling_vllm_falcon'] = ['VLLMFalcon']
  if t.TYPE_CHECKING: from .modeling_vllm_falcon import VLLMFalcon as VLLMFalcon

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py
@@ -1,32 +1,32 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import torch, transformers
-else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
-class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
+else: torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers')
+class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
-    return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
+    return {'torch_dtype': torch.bfloat16, 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device)
-    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):  # type: ignore[attr-defined]
+    eos_token_id, inputs = attrs.pop('eos_token_id', self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
      return self.tokenizer.batch_decode(
          self.model.generate(
-              input_ids=inputs["input_ids"],
-              attention_mask=inputs["attention_mask"],
+              input_ids=inputs['input_ids'],
+              attention_mask=inputs['attention_mask'],
              generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()
          ),
          skip_special_tokens=True
      )

-  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
-    src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
-    result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+    result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
-    return [{"generated_text": result}]
+    return [{'generated_text': result}]
--- a/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_vllm_falcon.py
@@ -3,6 +3,6 @@ import logging, typing as t, openllm
 if t.TYPE_CHECKING: import vllm, transformers

 logger = logging.getLogger(__name__)
-class VLLMFalcon(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]):
+class VLLMFalcon(openllm.LLM['vllm.LLMEngine', 'transformers.PreTrainedTokenizerBase']):
  __openllm_internal__ = True
-  tokenizer_id = "local"
+  tokenizer_id = 'local'
--- a/openllm-python/src/openllm/models/flan_t5/init.py
+++ b/openllm-python/src/openllm/models/flan_t5/init.py
@@ -9,21 +9,21 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_flan_t5"] = ["FlanT5"]
+  _import_structure['modeling_flan_t5'] = ['FlanT5']
  if t.TYPE_CHECKING: from .modeling_flan_t5 import FlanT5 as FlanT5
 try:
  if not is_flax_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"]
+  _import_structure['modeling_flax_flan_t5'] = ['FlaxFlanT5']
  if t.TYPE_CHECKING: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5
 try:
  if not is_tf_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"]
+  _import_structure['modeling_tf_flan_t5'] = ['TFFlanT5']
  if t.TYPE_CHECKING: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -1,14 +1,14 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import transformers
-class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
+class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
    with torch.inference_mode():
      return self.tokenizer.batch_decode(
-          self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
          skip_special_tokens=True
      )

@@ -17,7 +17,7 @@ class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformer
    embeddings: list[list[float]] = []
    num_tokens = 0
    for prompt in prompts:
-      input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
+      input_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(self.device)
      with torch.inference_mode():
        outputs = self.model(input_ids, decoder_input_ids=input_ids)
        data = F.normalize(torch.mean(outputs.encoder_last_hidden_state[0], dim=0), p=2, dim=0)
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -3,7 +3,7 @@ import typing as t, openllm
 from openllm_core._prompt import process_prompt
 from openllm_core.config.configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE
 if t.TYPE_CHECKING: import transformers
-class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
+class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
  __openllm_internal__ = True

  def sanitize_parameters(
@@ -20,20 +20,20 @@ class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "tra
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    if decoder_start_token_id is None: decoder_start_token_id = 0
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_k": top_k,
-        "top_p": top_p,
-        "repetition_penalty": repetition_penalty,
-        "decoder_start_token_id": decoder_start_token_id
+        'max_new_tokens': max_new_tokens,
+        'temperature': temperature,
+        'top_k': top_k,
+        'top_p': top_p,
+        'repetition_penalty': repetition_penalty,
+        'decoder_start_token_id': decoder_start_token_id
    }, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
-    decoder_start_token_id = attrs.pop("decoder_start_token_id", 0)
+    decoder_start_token_id = attrs.pop('decoder_start_token_id', 0)
    return self.tokenizer.batch_decode(
        self.model.generate(
-            self.tokenizer(prompt, return_tensors="np")["input_ids"],
+            self.tokenizer(prompt, return_tensors='np')['input_ids'],
            do_sample=True,
            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
            decoder_start_token_id=decoder_start_token_id
--- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import transformers
-class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transformers.T5TokenizerFast"]):
+class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transformers.T5TokenizerFast']):
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    return self.tokenizer.batch_decode(
-        self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+        self.model.generate(self.tokenizer(prompt, return_tensors='tf').input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
        skip_special_tokens=True
    )
--- a/openllm-python/src/openllm/models/gpt_neox/init.py
+++ b/openllm-python/src/openllm/models/gpt_neox/init.py
@@ -9,14 +9,14 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_gpt_neox"] = ["GPTNeoX"]
+  _import_structure['modeling_gpt_neox'] = ['GPTNeoX']
  if t.TYPE_CHECKING: from .modeling_gpt_neox import GPTNeoX as GPTNeoX
 try:
  if not is_vllm_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_gpt_neox"] = ["VLLMGPTNeoX"]
+  _import_structure['modeling_vllm_gpt_neox'] = ['VLLMGPTNeoX']
  if t.TYPE_CHECKING: from .modeling_vllm_gpt_neox import VLLMGPTNeoX as VLLMGPTNeoX

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
+++ b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py
@@ -3,13 +3,13 @@ import logging, typing as t, openllm
 if t.TYPE_CHECKING: import transformers

 logger = logging.getLogger(__name__)
-class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
+class GPTNeoX(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
-    return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
+    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}

  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
    import transformers
@@ -22,7 +22,7 @@ class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNe
    with torch.inference_mode():
      return self.tokenizer.batch_decode(
          self.model.generate(
-              self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids,
+              self.tokenizer(prompt, return_tensors='pt').to(self.device).input_ids,
              do_sample=True,
              generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
              pad_token_id=self.tokenizer.eos_token_id,
--- a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
+++ b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import vllm, transformers
-class VLLMGPTNeoX(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]):
+class VLLMGPTNeoX(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']):
  __openllm_internal__ = True
-  tokenizer_id = "local"
+  tokenizer_id = 'local'
--- a/openllm-python/src/openllm/models/llama/init.py
+++ b/openllm-python/src/openllm/models/llama/init.py
@@ -9,14 +9,14 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_llama"] = ["VLLMLlama"]
+  _import_structure['modeling_vllm_llama'] = ['VLLMLlama']
  if t.TYPE_CHECKING: from .modeling_vllm_llama import VLLMLlama as VLLMLlama
 try:
  if not is_torch_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_llama"] = ["Llama"]
+  _import_structure['modeling_llama'] = ['Llama']
  if t.TYPE_CHECKING: from .modeling_llama import Llama as Llama

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/llama/modeling_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_llama.py
@@ -1,18 +1,18 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import transformers
-class Llama(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]):
+class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaTokenizerFast']):
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
-    return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+    return {'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}

  def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
    import torch, torch.nn.functional as F
-    encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device)
-    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+    encoding = self.tokenizer(prompts, padding=True, return_tensors='pt').to(self.device)
+    input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']
    with torch.inference_mode():
      data = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
      mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
--- a/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_vllm_llama.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import vllm, transformers
-class VLLMLlama(openllm.LLM["vllm.LLMEngine", "transformers.LlamaTokenizerFast"]):
+class VLLMLlama(openllm.LLM['vllm.LLMEngine', 'transformers.LlamaTokenizerFast']):
  __openllm_internal__ = True
--- a/openllm-python/src/openllm/models/mpt/init.py
+++ b/openllm-python/src/openllm/models/mpt/init.py
@@ -9,14 +9,14 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_mpt"] = ["MPT"]
+  _import_structure['modeling_mpt'] = ['MPT']
  if t.TYPE_CHECKING: from .modeling_mpt import MPT as MPT
 try:
  if not is_vllm_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_mpt"] = ["VLLMMPT"]
+  _import_structure['modeling_vllm_mpt'] = ['VLLMMPT']
  if t.TYPE_CHECKING: from .modeling_vllm_mpt import VLLMMPT as VLLMMPT

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
@@ -9,8 +9,8 @@ def get_mpt_config(
 ) -> transformers.PretrainedConfig:
  import torch
  config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
-  if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
-  if hasattr(config, "attn_config") and is_triton_available(): config.attn_config["attn_impl"] = "triton"
+  if hasattr(config, 'init_device') and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device)
+  if hasattr(config, 'attn_config') and is_triton_available(): config.attn_config['attn_impl'] = 'triton'
  else:
    logger.debug(
        "'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'"
@@ -18,7 +18,7 @@ def get_mpt_config(
  # setting max_seq_len
  config.max_seq_len = max_sequence_length
  return config
-class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXTokenizerFast"]):
+class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXTokenizerFast']):
  __openllm_internal__ = True

  def llm_post_init(self) -> None:
@@ -28,28 +28,28 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
-    return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {}
+    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {}

  def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model:
    import torch, transformers
    _, tokenizer_attrs = self.llm_parameters
-    torch_dtype = attrs.pop("torch_dtype", self.dtype)
-    device_map = attrs.pop("device_map", None)
-    attrs.pop("low_cpu_mem_usage", None)
+    torch_dtype = attrs.pop('torch_dtype', self.dtype)
+    device_map = attrs.pop('device_map', None)
+    attrs.pop('low_cpu_mem_usage', None)
    config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
    if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs)
    try:
-      return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+      return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
    finally:
      torch.cuda.empty_cache()

  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel:
    import transformers
-    torch_dtype = attrs.pop("torch_dtype", self.dtype)
-    device_map = attrs.pop("device_map", None)
-    trust_remote_code = attrs.pop("trust_remote_code", True)
+    torch_dtype = attrs.pop('torch_dtype', self.dtype)
+    device_map = attrs.pop('device_map', None)
+    trust_remote_code = attrs.pop('trust_remote_code', True)
    config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,)
    model = transformers.AutoModelForCausalLM.from_pretrained(
        self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs
@@ -60,16 +60,16 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
    llm_config = self.config.model_construct_env(**attrs)
-    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+    inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    attrs = {
-        "do_sample": False if llm_config["temperature"] == 0 else True,
-        "eos_token_id": self.tokenizer.eos_token_id,
-        "pad_token_id": self.tokenizer.pad_token_id,
-        "generation_config": llm_config.to_generation_config()
+        'do_sample': False if llm_config['temperature'] == 0 else True,
+        'eos_token_id': self.tokenizer.eos_token_id,
+        'pad_token_id': self.tokenizer.pad_token_id,
+        'generation_config': llm_config.to_generation_config()
    }
    with torch.inference_mode():
      if torch.cuda.is_available():
-        with torch.autocast("cuda", torch.float16):  # type: ignore[attr-defined]
+        with torch.autocast('cuda', torch.float16):  # type: ignore[attr-defined]
          generated_tensors = self.model.generate(**inputs, **attrs)
      else:
        generated_tensors = self.model.generate(**inputs, **attrs)
--- a/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_vllm_mpt.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import transformers, vllm
-class VLLMMPT(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]):
+class VLLMMPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']):
  __openllm_internal__ = True
-  tokenizer_id = "local"
+  tokenizer_id = 'local'
--- a/openllm-python/src/openllm/models/opt/init.py
+++ b/openllm-python/src/openllm/models/opt/init.py
@@ -9,28 +9,28 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_opt"] = ["OPT"]
+  _import_structure['modeling_opt'] = ['OPT']
  if t.TYPE_CHECKING: from .modeling_opt import OPT as OPT
 try:
  if not is_flax_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_flax_opt"] = ["FlaxOPT"]
+  _import_structure['modeling_flax_opt'] = ['FlaxOPT']
  if t.TYPE_CHECKING: from .modeling_flax_opt import FlaxOPT as FlaxOPT
 try:
  if not is_vllm_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_opt"] = ["VLLMOPT"]
+  _import_structure['modeling_vllm_opt'] = ['VLLMOPT']
  if t.TYPE_CHECKING: from .modeling_vllm_opt import VLLMOPT as VLLMOPT
 try:
  if not is_tf_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_tf_opt"] = ["TFOPT"]
+  _import_structure['modeling_tf_opt'] = ['TFOPT']
  if t.TYPE_CHECKING: from .modeling_tf_opt import TFOPT as TFOPT

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
@@ -4,17 +4,17 @@ from openllm._prompt import process_prompt
 from openllm.utils import generate_labels
 from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE
 if t.TYPE_CHECKING: import transformers
-else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
+else: transformers = openllm.utils.LazyLoader('transformers', globals(), 'transformers')

 logger = logging.getLogger(__name__)
-class FlaxOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
+class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tokenizer']):
  __openllm_internal__ = True

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
    tokenizer.pad_token_id = config.pad_token_id
    return bentoml.transformers.save_model(
-        self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)
+        self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self)
    )

  def sanitize_parameters(
@@ -29,11 +29,11 @@ class FlaxOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tok
      **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
-        "max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty
+        'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences, 'repetition_penalty': repetition_penalty
    }, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    return self.tokenizer.batch_decode(
-        self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
+        self.model.generate(**self.tokenizer(prompt, return_tensors='np'), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
        skip_special_tokens=True
    )
--- a/openllm-python/src/openllm/models/opt/modeling_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_opt.py
@@ -3,18 +3,18 @@ import logging, typing as t, openllm
 if t.TYPE_CHECKING: import transformers

 logger = logging.getLogger(__name__)
-class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer"]):
+class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer']):
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
-    return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+    return {'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
    with torch.inference_mode():
      return self.tokenizer.batch_decode(
-          self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
          skip_special_tokens=True
      )
--- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 import typing as t, bentoml, openllm
 from openllm_core.utils import generate_labels
 if t.TYPE_CHECKING: import transformers
-class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]):
+class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tokenizer']):
  __openllm_internal__ = True

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
@@ -12,12 +12,12 @@ class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Token
    return bentoml.transformers.save_model(
        self.tag,
        transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),
-        custom_objects={"tokenizer": tokenizer},
+        custom_objects={'tokenizer': tokenizer},
        labels=generate_labels(self)
    )

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    return self.tokenizer.batch_decode(
-        self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+        self.model.generate(**self.tokenizer(prompt, return_tensors='tf'), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
        skip_special_tokens=True
    )
--- a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py
@@ -3,9 +3,9 @@ import typing as t, openllm
 from openllm_core._prompt import process_prompt
 from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE
 if t.TYPE_CHECKING: import vllm, transformers
-class VLLMOPT(openllm.LLM["vllm.LLMEngine", "transformers.GPT2Tokenizer"]):
+class VLLMOPT(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2Tokenizer']):
  __openllm_internal__ = True
-  tokenizer_id = "local"
+  tokenizer_id = 'local'

  def sanitize_parameters(
      self,
@@ -18,5 +18,5 @@ class VLLMOPT(openllm.LLM["vllm.LLMEngine", "transformers.GPT2Tokenizer"]):
      **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
-        "max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences
+        'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'num_return_sequences': num_return_sequences
    }, {}
--- a/openllm-python/src/openllm/models/stablelm/init.py
+++ b/openllm-python/src/openllm/models/stablelm/init.py
@@ -9,14 +9,14 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_stablelm"] = ["StableLM"]
+  _import_structure['modeling_stablelm'] = ['StableLM']
  if t.TYPE_CHECKING: from .modeling_stablelm import StableLM as StableLM
 try:
  if not is_vllm_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_stablelm"] = ["VLLMStableLM"]
+  _import_structure['modeling_vllm_stablelm'] = ['VLLMStableLM']
  if t.TYPE_CHECKING: from .modeling_vllm_stablelm import VLLMStableLM as VLLMStableLM

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 import typing as t, openllm
 if t.TYPE_CHECKING: import transformers
-class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]):
+class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTNeoXTokenizerFast']):
  __openllm_internal__ = True

  def llm_post_init(self) -> None:
@@ -11,7 +11,7 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
-    return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+    return {'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
@@ -19,7 +19,7 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN
      return [
          self.tokenizer.decode(
              self.model.generate(
-                  **self.tokenizer(prompt, return_tensors="pt").to(self.device),
+                  **self.tokenizer(prompt, return_tensors='pt').to(self.device),
                  do_sample=True,
                  generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
                  pad_token_id=self.tokenizer.eos_token_id,
--- a/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_vllm_stablelm.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import logging, typing as t, openllm
 if t.TYPE_CHECKING: import vllm, transformers
-class VLLMStableLM(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]):
+class VLLMStableLM(openllm.LLM['vllm.LLMEngine', 'transformers.GPTNeoXTokenizerFast']):
  __openllm_internal__ = True
-  tokenizer_id = "local"
+  tokenizer_id = 'local'
--- a/openllm-python/src/openllm/models/starcoder/init.py
+++ b/openllm-python/src/openllm/models/starcoder/init.py
@@ -9,14 +9,14 @@ try:
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_starcoder"] = ["StarCoder"]
+  _import_structure['modeling_starcoder'] = ['StarCoder']
  if t.TYPE_CHECKING: from .modeling_starcoder import StarCoder as StarCoder
 try:
  if not is_vllm_available(): raise MissingDependencyError
 except MissingDependencyError:
  pass
 else:
-  _import_structure["modeling_vllm_starcoder"] = ["VLLMStarCoder"]
+  _import_structure['modeling_vllm_starcoder'] = ['VLLMStarCoder']
  if t.TYPE_CHECKING: from .modeling_vllm_starcoder import VLLMStarCoder as VLLMStarCoder

-sys.modules[__name__] = LazyModule(__name__, globals()["__file__"], _import_structure)
+sys.modules[__name__] = LazyModule(__name__, globals()['__file__'], _import_structure)
--- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
@@ -3,22 +3,22 @@ import logging, typing as t, bentoml, openllm
 from openllm.utils import generate_labels
 from openllm_core.config.configuration_starcoder import EOD, FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX
 if t.TYPE_CHECKING: import transformers
-class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]):
+class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.GPT2TokenizerFast']):
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
    import torch
-    return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
+    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.float16 if torch.cuda.is_available() else torch.float32}, {}

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
    import torch, transformers
-    torch_dtype, device_map = attrs.pop("torch_dtype", torch.float16), attrs.pop("device_map", "auto")
+    torch_dtype, device_map = attrs.pop('torch_dtype', torch.float16), attrs.pop('device_map', 'auto')
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
-    tokenizer.add_special_tokens({"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], "pad_token": EOD})
+    tokenizer.add_special_tokens({'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], 'pad_token': EOD})
    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
    try:
-      return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self))
+      return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
    finally:
      torch.cuda.empty_cache()

@@ -28,7 +28,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
      # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
      # NOTE: support fine-tuning starcoder
      result_tensor = self.model.generate(
-          self.tokenizer.encode(prompt, return_tensors="pt").to(self.device),
+          self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
          do_sample=True,
          pad_token_id=self.tokenizer.eos_token_id,
          generation_config=self.config.model_construct_env(**attrs).to_generation_config()
@@ -37,12 +37,12 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.
      # return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
      return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

-  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device)
-    src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
-    result = self.tokenizer.decode(self.model.generate(encoded_inputs["input_ids"], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+    result = self.tokenizer.decode(self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
-    return [{"generated_text": result}]
+    return [{'generated_text': result}]
--- a/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/modeling_vllm_starcoder.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 import logging, typing as t, openllm
 if t.TYPE_CHECKING: import vllm, transformers
-class VLLMStarCoder(openllm.LLM["vllm.LLMEngine", "transformers.GPT2TokenizerFast"]):
+class VLLMStarCoder(openllm.LLM['vllm.LLMEngine', 'transformers.GPT2TokenizerFast']):
  __openllm_internal__ = True
-  tokenizer_id = "local"
+  tokenizer_id = 'local'