fix(yapf): align weird new lines break [generated] [skip ci] (#284)

fix(yapf): align weird new lines break

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
2026-05-19 05:57:39 -04:00 · 2023-09-01 05:34:22 -04:00
parent 3e45530abd
commit b7af7765d4
91 changed files with 811 additions and 1678 deletions
--- a/openllm-python/src/openllm/models/auto/factory.py
+++ b/openllm-python/src/openllm/models/auto/factory.py
@@ -30,9 +30,7 @@ class BaseAutoLLMClass:
  _model_mapping: t.ClassVar[_LazyAutoMapping]

  def __init__(self, *args: t.Any, **attrs: t.Any):
-    raise EnvironmentError(
-        f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead."
-    )
+    raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.")

  @classmethod
  def for_model(cls,
@@ -50,10 +48,7 @@ class BaseAutoLLMClass:
    >>> llm = openllm.AutoLLM.for_model("flan-t5")
    ```
    '''
-    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id,
-                                                           model_version=model_version,
-                                                           llm_config=llm_config,
-                                                           **attrs)
+    llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
    if ensure_available: llm.ensure_model_id_exists()
    return llm

@@ -116,9 +111,7 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
  This OrderedDict values() and keys() returns the list instead, so you don't
  have to do list(mapping.values()) to get the list of values.
  """
-
-  def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString],
-               model_mapping: OrderedDict[LiteralString, LiteralString]):
+  def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], model_mapping: OrderedDict[LiteralString, LiteralString]):
    self._config_mapping = config_mapping
    self._reverse_config_mapping = {v: k for k, v in config_mapping.items()}
    self._model_mapping = model_mapping
@@ -153,32 +146,26 @@ class _LazyAutoMapping(OrderedDict, ReprMixin):
    return ReprMixin.__repr__(self)

  def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]:
-    yield from ((key, (value, self._model_mapping[key]))
-                for key, value in self._config_mapping.items()
-                if key in self._model_mapping)
+    yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping)

  def __bool__(self) -> bool:
    return bool(self.keys())

  def keys(self) -> ConfigModelKeysView:
-    return t.cast('ConfigModelKeysView', [
-        self._load_attr_from_module(key, name)
-        for key, name in self._config_mapping.items()
-        if key in self._model_mapping.keys()
-    ] + list(self._extra_content.keys()))
+    return t.cast('ConfigModelKeysView',
+                  [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] +
+                  list(self._extra_content.keys()))

  def values(self) -> ConfigModelValuesView:
-    return t.cast('ConfigModelValuesView', [
-        self._load_attr_from_module(key, name)
-        for key, name in self._model_mapping.items()
-        if key in self._config_mapping.keys()
-    ] + list(self._extra_content.values()))
+    return t.cast('ConfigModelValuesView',
+                  [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] +
+                  list(self._extra_content.values()))

  def items(self) -> ConfigModelItemsView:
-    return t.cast('ConfigModelItemsView', [(self._load_attr_from_module(
-        key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
-                                           for key in self._model_mapping.keys()
-                                           if key in self._config_mapping.keys()] + list(self._extra_content.items()))
+    return t.cast('ConfigModelItemsView',
+                  [(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key]))
+                   for key in self._model_mapping.keys()
+                   if key in self._config_mapping.keys()] + list(self._extra_content.items()))

  def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]:
    return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys()))
--- a/openllm-python/src/openllm/models/auto/modeling_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_auto.py
@@ -7,10 +7,9 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass
 from .factory import _LazyAutoMapping

-MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'),
-                                   ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
-                                   ('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'),
-                                   ('baichuan', 'Baichuan')])
+MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'),
+                                   ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), ('opt', 'OPT'), ('stablelm', 'StableLM'),
+                                   ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)

 class AutoLLM(BaseAutoLLMClass):
--- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
@@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
 from .factory import BaseAutoLLMClass
 from .factory import _LazyAutoMapping

-MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'),
-                                        ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
-                                        ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
+MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'),
+                                        ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
                                        ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
 MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)

--- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
@@ -11,6 +11,5 @@ class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrai
    import torch
    inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
-      outputs = self.model.generate(**inputs,
-                                    generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
      return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -14,9 +14,7 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
      self.model.eval()
      # Only use half precision if the model is not yet quantized
      if self.config.use_half_precision: self.model.half()
-      return self.model.chat(self.tokenizer,
-                             prompt,
-                             generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())

  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
    import torch
--- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -11,8 +11,9 @@ from openllm_core.config.configuration_dolly_v2 import RESPONSE_KEY
 from openllm_core.config.configuration_dolly_v2 import get_special_token_id
 if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
 else:
-  torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader(
-      'transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow')
+  torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(),
+                                                                                                            'transformers'), openllm.utils.LazyLoader(
+                                                                                                                'tf', globals(), 'tensorflow')
 logger = logging.getLogger(__name__)

@overload
@@ -35,22 +36,8 @@ def get_pipeline(model: transformers.PreTrainedModel,
                 **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline:
  # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information.
  class InstructionTextGenerationPipeline(transformers.Pipeline):
-
-    def __init__(self,
-                 *args: t.Any,
-                 do_sample: bool = True,
-                 max_new_tokens: int = 256,
-                 top_p: float = 0.92,
-                 top_k: int = 0,
-                 **kwargs: t.Any):
-      super().__init__(*args,
-                       model=model,
-                       tokenizer=tokenizer,
-                       do_sample=do_sample,
-                       max_new_tokens=max_new_tokens,
-                       top_p=top_p,
-                       top_k=top_k,
-                       **kwargs)
+    def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any):
+      super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)

    def _sanitize_parameters(self,
                             return_full_text: bool | None = None,
@@ -59,8 +46,7 @@ def get_pipeline(model: transformers.PreTrainedModel,
      preprocess_params: dict[str, t.Any] = {}
      # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
      # append a newline to yield a single token.  find whatever token is configured for the response key.
-      tokenizer_response_key = next(
-          (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
+      tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None)
      response_key_token_id = None
      end_key_token_id = None
      if tokenizer_response_key:
@@ -84,17 +70,15 @@ def get_pipeline(model: transformers.PreTrainedModel,
      inputs['instruction_text'] = input_
      return t.cast(t.Dict[str, t.Any], inputs)

-    def _forward(self, input_tensors: dict[str, t.Any],
-                 **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
+    def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput:
      if t.TYPE_CHECKING: assert self.tokenizer is not None
      input_ids, attention_mask = input_tensors['input_ids'], input_tensors.get('attention_mask', None)
      if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1
      else: in_b = input_ids.shape[0]
-      generated_sequence = self.model.generate(
-          input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
-          attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
-          pad_token_id=self.tokenizer.pad_token_id,
-          **generate_kwargs)
+      generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
+                                               attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
+                                               pad_token_id=self.tokenizer.pad_token_id,
+                                               **generate_kwargs)
      out_b = generated_sequence.shape[0]
      if self.framework == 'pt':
        generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
@@ -162,10 +146,7 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
-    return {
-        'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
-        'torch_dtype': torch.bfloat16
-    }, {}
+    return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {}

  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline:
    return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs),
@@ -176,6 +157,4 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken
  def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
    llm_config = self.config.model_construct_env(**attrs)
    with torch.inference_mode():
-      return self.model(prompt,
-                        return_full_text=llm_config.return_full_text,
-                        generation_config=llm_config.to_generation_config())
+      return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())
--- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py
+++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py
@@ -4,42 +4,31 @@ import typing as t
 import openllm
 if t.TYPE_CHECKING: import torch, transformers
 else:
-  torch, transformers = openllm.utils.LazyLoader('torch', globals(),
-                                                 'torch'), openllm.utils.LazyLoader('transformers', globals(),
-                                                                                    'transformers')
+  torch, transformers = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers')

 class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']):
  __openllm_internal__ = True

  @property
  def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
-    return {
-        'torch_dtype': torch.bfloat16,
-        'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None
-    }, {}
+    return {'torch_dtype': torch.bfloat16, 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    eos_token_id, inputs = attrs.pop('eos_token_id',
-                                     self.tokenizer.eos_token_id), self.tokenizer(prompt,
-                                                                                  return_tensors='pt').to(self.device)
+    eos_token_id, inputs = attrs.pop('eos_token_id', self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors='pt').to(self.device)
    with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16):  # type: ignore[attr-defined]
-      return self.tokenizer.batch_decode(self.model.generate(
-          input_ids=inputs['input_ids'],
-          attention_mask=inputs['attention_mask'],
-          generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()),
+      return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs['input_ids'],
+                                                             attention_mask=inputs['attention_mask'],
+                                                             generation_config=self.config.model_construct_env(eos_token_id=eos_token_id,
+                                                                                                               **attrs).to_generation_config()),
                                         skip_special_tokens=True)

-  def generate_one(self, prompt: str, stop: list[str],
-                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
-        prompt, return_tensors='pt').to(self.device)
-    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
-        'stopping_criteria', openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
+                                                                                                    openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
    result = self.tokenizer.decode(
-        self.model.generate(encoded_inputs['input_ids'],
-                            max_new_tokens=max_new_tokens,
-                            stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+        self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py
@@ -11,11 +11,10 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
    with torch.inference_mode():
-      return self.tokenizer.batch_decode(
-          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
-                              do_sample=True,
-                              generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-          skip_special_tokens=True)
+      return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
+                                                             do_sample=True,
+                                                             generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+                                         skip_special_tokens=True)

  def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput:
    import torch
--- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py
@@ -32,10 +32,9 @@ class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'tra
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation.
    decoder_start_token_id = attrs.pop('decoder_start_token_id', 0)
-    return self.tokenizer.batch_decode(self.model.generate(
-        self.tokenizer(prompt, return_tensors='np')['input_ids'],
-        do_sample=True,
-        generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-        decoder_start_token_id=decoder_start_token_id).sequences,
+    return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors='np')['input_ids'],
+                                                           do_sample=True,
+                                                           generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+                                                           decoder_start_token_id=decoder_start_token_id).sequences,
                                       skip_special_tokens=True,
                                       clean_up_tokenization_spaces=True)
--- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
+++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py
@@ -8,8 +8,7 @@ class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transfo
  __openllm_internal__ = True

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    return self.tokenizer.batch_decode(self.model.generate(
-        self.tokenizer(prompt, return_tensors='tf').input_ids,
-        do_sample=True,
-        generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+    return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors='tf').input_ids,
+                                                           do_sample=True,
+                                                           generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
                                       skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/llama/modeling_llama.py
+++ b/openllm-python/src/openllm/models/llama/modeling_llama.py
@@ -26,17 +26,13 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke
    return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(),
                                    num_tokens=int(torch.sum(attention_mask).item()))

-  def generate_one(self, prompt: str, stop: list[str],
-                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
-        prompt, return_tensors='pt').to(self.device)
-    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
-        'stopping_criteria', openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
+                                                                                                    openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
    result = self.tokenizer.decode(
-        self.model.generate(encoded_inputs['input_ids'],
-                            max_new_tokens=max_new_tokens,
-                            stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+        self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]
--- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py
+++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py
@@ -48,11 +48,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
    torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32)
    device_map = attrs.pop('device_map', None)
    attrs.pop('low_cpu_mem_usage', None)
-    config = get_mpt_config(self.model_id,
-                            self.config.max_sequence_length,
-                            self.device,
-                            device_map=device_map,
-                            trust_remote_code=trust_remote_code)
+    config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code)
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs)
    if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token
    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
@@ -62,10 +58,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
                                                              device_map=device_map,
                                                              **attrs)
    try:
-      return bentoml.transformers.save_model(self.tag,
-                                             model,
-                                             custom_objects={'tokenizer': tokenizer},
-                                             labels=generate_labels(self))
+      return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
    finally:
      torch.cuda.empty_cache()

@@ -79,7 +72,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken
                            self.device,
                            device_map=device_map,
                            trust_remote_code=trust_remote_code,
-                           )
+                            )
    model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path,
                                                              config=config,
                                                              trust_remote_code=trust_remote_code,
--- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py
@@ -16,12 +16,11 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
  __openllm_internal__ = True

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
-    config, tokenizer = transformers.AutoConfig.from_pretrained(
-        self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
+        self.model_id, **self.llm_parameters[-1])
    tokenizer.pad_token_id = config.pad_token_id
    return bentoml.transformers.save_model(self.tag,
-                                           transformers.FlaxAutoModelForCausalLM.from_pretrained(
-                                               self.model_id, **attrs),
+                                           transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs),
                                           custom_objects={'tokenizer': tokenizer},
                                           labels=generate_labels(self))

@@ -45,6 +44,5 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='np'),
                                                           do_sample=True,
-                                                           generation_config=self.config.model_construct_env(
-                                                               **attrs).to_generation_config()).sequences,
+                                                           generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences,
                                       skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_opt.py
@@ -18,8 +18,7 @@ class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer
  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
    import torch
    with torch.inference_mode():
-      return self.tokenizer.batch_decode(
-          self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
-                              do_sample=True,
-                              generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-          skip_special_tokens=True)
+      return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
+                                                             do_sample=True,
+                                                             generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+                                         skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
+++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py
@@ -11,18 +11,16 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token

  def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model:
    import transformers
-    config, tokenizer = transformers.AutoConfig.from_pretrained(
-        self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
+    config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(
+        self.model_id, **self.llm_parameters[-1])
    tokenizer.pad_token_id = config.pad_token_id
    return bentoml.transformers.save_model(self.tag,
-                                           transformers.TFOPTForCausalLM.from_pretrained(
-                                               self.model_id, trust_remote_code=trust_remote_code, **attrs),
+                                           transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs),
                                           custom_objects={'tokenizer': tokenizer},
                                           labels=generate_labels(self))

  def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
-    return self.tokenizer.batch_decode(
-        self.model.generate(**self.tokenizer(prompt, return_tensors='tf'),
-                            do_sample=True,
-                            generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
-        skip_special_tokens=True)
+    return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='tf'),
+                                                           do_sample=True,
+                                                           generation_config=self.config.model_construct_env(**attrs).to_generation_config()),
+                                       skip_special_tokens=True)
--- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
+++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py
@@ -17,11 +17,10 @@ class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTN
    import torch
    with torch.inference_mode():
      return [
-          self.tokenizer.decode(
-              self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
-                                  do_sample=True,
-                                  generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-                                  pad_token_id=self.tokenizer.eos_token_id,
-                                  stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0],
-              skip_special_tokens=True)
+          self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device),
+                                                    do_sample=True,
+                                                    generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+                                                    pad_token_id=self.tokenizer.eos_token_id,
+                                                    stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0],
+                                skip_special_tokens=True)
      ]
--- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py
@@ -28,19 +28,10 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
    import transformers
    torch_dtype, device_map = attrs.pop('torch_dtype', torch.float16), attrs.pop('device_map', 'auto')
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1])
-    tokenizer.add_special_tokens({
-        'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD],
-        'pad_token': EOD
-    })
-    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id,
-                                                              torch_dtype=torch_dtype,
-                                                              device_map=device_map,
-                                                              **attrs)
+    tokenizer.add_special_tokens({'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], 'pad_token': EOD})
+    model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs)
    try:
-      return bentoml.transformers.save_model(self.tag,
-                                             model,
-                                             custom_objects={'tokenizer': tokenizer},
-                                             labels=generate_labels(self))
+      return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self))
    finally:
      torch.cuda.empty_cache()

@@ -49,26 +40,21 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers.
    with torch.inference_mode():
      # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
      # NOTE: support fine-tuning starcoder
-      result_tensor = self.model.generate(
-          self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
-          do_sample=True,
-          pad_token_id=self.tokenizer.eos_token_id,
-          generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+      result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors='pt').to(self.device),
+                                          do_sample=True,
+                                          pad_token_id=self.tokenizer.eos_token_id,
+                                          generation_config=self.config.model_construct_env(**attrs).to_generation_config())
      # TODO: We will probably want to return the tokenizer here so that we can manually process this
      # return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
      return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

-  def generate_one(self, prompt: str, stop: list[str],
-                   **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
-    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(
-        prompt, return_tensors='pt').to(self.device)
-    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop(
-        'stopping_criteria', openllm.StoppingCriteriaList([]))
+  def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]:
+    max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device)
+    src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria',
+                                                                                                    openllm.StoppingCriteriaList([]))
    stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer))
    result = self.tokenizer.decode(
-        self.model.generate(encoded_inputs['input_ids'],
-                            max_new_tokens=max_new_tokens,
-                            stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
+        self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:])
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq): result = result[:-len(stop_seq)]