From d04309188b8fccec6a3ff36a893099806f560551 Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Tue, 28 Nov 2023 07:04:27 +0000
Subject: [PATCH] chore(style): compact formatting of _configuration.py and _runners.py (2.7k)

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
---
 .../src/openllm_core/_configuration.py        | 43 ++-------
 openllm-python/src/openllm/_runners.py        | 94 +++++--------------
 2 files changed, 32 insertions(+), 105 deletions(-)

diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py
index c7ca53b9..36230025 100644
--- a/openllm-core/src/openllm_core/_configuration.py
+++ b/openllm-core/src/openllm_core/_configuration.py
@@ -985,9 +985,7 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
 
   def __setattr__(self, attr: str, value: t.Any) -> None:
     if attr in _reserved_namespace:
-      raise ForbiddenAttributeError(
-        f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.'
-      )
+      raise ForbiddenAttributeError(f'{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.')
     super().__setattr__(attr, value)
 
   def __init__(
@@ -1192,7 +1190,6 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
   @overload
   def __getitem__(self, item: t.Literal['lokr']) -> t.Dict[str, t.Any]: ...
   # update-config-stubs.py: stop
-
   def __getitem__(self, item: LiteralString | t.Any) -> t.Any:
     """Allowing access LLMConfig as a dictionary. The order will always evaluate as.
 
@@ -1224,20 +1221,12 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
       return self.__openllm_extras__[item]
     else:
       raise KeyError(item)
-
   def __getattribute__(self, item: str) -> t.Any:
     if item in _reserved_namespace:
-      raise ForbiddenAttributeError(
-        f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified."
-      )
+      raise ForbiddenAttributeError(f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified.")
     return _object_getattribute.__get__(self)(item)
-
-  def __len__(self) -> int:
-    return len(self.__openllm_accepted_keys__) + len(self.__openllm_extras__)
-
-  def keys(self) -> list[str]:
-    return list(self.__openllm_accepted_keys__) + list(self.__openllm_extras__)
-
+  def __len__(self) -> int: return len(self.__openllm_accepted_keys__) + len(self.__openllm_extras__)
+  def keys(self) -> list[str]: return list(self.__openllm_accepted_keys__) + list(self.__openllm_extras__)
   def values(self) -> list[t.Any]:
     return (
       [getattr(self, k.name) for k in attr.fields(self.__class__)]
@@ -1245,7 +1234,6 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
       + [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)]
       + list(self.__openllm_extras__.values())
     )
-
   def items(self) -> list[tuple[str, t.Any]]:
     return (
       [(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)]
@@ -1253,13 +1241,9 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
       + [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)]
       + list(self.__openllm_extras__.items())
     )
-
-  def __iter__(self) -> t.Iterator[str]:
-    return iter(self.keys())
-
+  def __iter__(self) -> t.Iterator[str]: return iter(self.keys())
   def __contains__(self, item: t.Any) -> bool:
-    if item in self.__openllm_extras__:
-      return True
+    if item in self.__openllm_extras__: return True
     return item in self.__openllm_accepted_keys__
 
   @classmethod
@@ -1429,12 +1413,10 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
         no_repeat_ngram_size=config['no_repeat_ngram_size'],
         end_token=config['stop'],
       )
-
   class pt:
     @staticmethod
     def build(config: LLMConfig) -> LLMConfig:
       return config
-
   class hf:
     @staticmethod
     def build(config: LLMConfig) -> transformers.GenerationConfig:
@@ -1442,10 +1424,8 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
 
   @overload
   def compatible_options(self, request: ChatCompletionRequest | CompletionRequest) -> dict[str, t.Any]: ...
-
   @overload
   def compatible_options(self, request: CohereChatRequest | CohereGenerateRequest) -> dict[str, t.Any]: ...
-
   def compatible_options(self, request: AttrsInstance) -> dict[str, t.Any]:
     if importlib.util.find_spec('openllm') is None:
       raise MissingDependencyError(
@@ -1460,7 +1440,6 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
       return self.cohere.build(self, request)
     else:
       raise TypeError(f'Unknown request type {type(request)}')
-
   class openai:
     @staticmethod
     def build(config: LLMConfig, request: ChatCompletionRequest | CompletionRequest) -> dict[str, t.Any]:
@@ -1478,7 +1457,6 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
       if hasattr(request, 'logprobs'):
         d['logprobs'] = first_not_none(request.logprobs, default=config['logprobs'])
       return d
-
   class cohere:
     @staticmethod
     def build(config: LLMConfig, request: CohereGenerateRequest | CohereChatRequest) -> dict[str, t.Any]:
@@ -1502,21 +1480,12 @@ class LLMConfig(_ConfigAttr[GenerationConfig, SamplingParams]):
   def system_message(self) -> str: return ''
   @property
   def chat_template(self) -> str | None: return
-
   @property
   def chat_messages(self) -> list[MessageParam]:
     from ._schemas import MessageParam
     return [MessageParam(role='system', content='You are a helpful assistant'), MessageParam(role='user', content="Hello, I'm looking for a chatbot that can help me with my work."), MessageParam(role='assistant', content='Yes? What can I help you with?')]
-
   @classmethod
   def parse(cls, f: AnyCallable) -> click.Command:
-    """Convert current configuration to click options.
-
-    This can be used as a decorator for click commands.
-
-    > [!NOTE]
-    > The identifier for all LLMConfig will be prefixed with '<model_name>_*', and the generation config will be prefixed with '<model_name>_generation_*'.
-    """
     for name, field in attr.fields_dict(cls.__openllm_generation_class__).items():
       ty = cls.__openllm_hints__.get(name)
       # NOTE: Union type is currently not yet supported, we probably just need to use environment instead.
diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py
index 2dfb4ee7..ba692b10 100644
--- a/openllm-python/src/openllm/_runners.py
+++ b/openllm-python/src/openllm/_runners.py
@@ -1,60 +1,43 @@
 from __future__ import annotations
-import gc
-import traceback
-import types
-import typing as t
-
-import torch
-
-import bentoml
-import openllm
+import gc, traceback, types, typing as t
+import torch, bentoml, openllm
 from openllm_core._schemas import CompletionChunk, GenerationOutput, SampleLogprobs
 from openllm_core.utils import ReprMixin, is_ctranslate_available, is_vllm_available
 
-__all__ = ['runner']
+if t.TYPE_CHECKING:
+  from openllm_core._typing_compat import M, T
+  from ._runners import Runner
 
 _registry = {}
-
+__all__ = ['runner']
 
 def registry(cls=None, *, alias=None):
   def decorator(_cls):
     _registry[_cls.__name__[:-8].lower() if alias is None else alias] = _cls
     return _cls
-
-  if cls is None:
-    return decorator
+  if cls is None: return decorator
   return decorator(cls)
 
-
-def runner(llm: openllm.LLM):
+def runner(llm: openllm.LLM[M, T]) -> Runner[M, T]:
   try:
     assert llm.bentomodel
   except (bentoml.exceptions.NotFound, AssertionError) as err:
     raise RuntimeError(f'Failed to locate {llm.bentomodel}: {err}') from err
 
   return types.new_class(
-    llm.config.__class__.__name__[:-6] + 'Runner',
-    (bentoml.Runner,),
+    llm.config.__class__.__name__[:-6] + 'Runner', (bentoml.Runner,), #
     exec_body=lambda ns: ns.update(
       {
-        'llm_type': llm.llm_type,
-        'identifying_params': llm.identifying_params,
-        'llm_tag': llm.tag,
-        'llm': llm,
-        'config': llm.config,
-        'backend': llm.__llm_backend__,
+        'llm_type': llm.llm_type, 'identifying_params': llm.identifying_params, #
+        'llm_tag': llm.tag, 'llm': llm, 'config': llm.config, 'backend': llm.__llm_backend__, #
+        '__module__': llm.__module__, '__repr__': ReprMixin.__repr__, #
         '__doc__': llm.config.__class__.__doc__ or f'Generated Runner class for {llm.config["model_name"]}',
-        '__module__': llm.__module__,
-        '__repr__': ReprMixin.__repr__,
         '__repr_keys__': property(lambda _: {'config', 'llm_type', 'runner_methods', 'backend', 'llm_tag'}),
         '__repr_args__': lambda _: (
           (
             'runner_methods',
             {
-              method.name: {
-                'batchable': method.config.batchable,
-                'batch_dim': method.config.batch_dim if method.config.batchable else None,
-              }
+              method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None}
               for method in _.runner_methods
             },
           ),
@@ -76,47 +59,35 @@ def runner(llm: openllm.LLM):
     runnable_init_params={'llm': llm},
   )
 
-
 @registry
 class CTranslateRunnable(bentoml.Runnable):
   SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'cpu')
   SUPPORTS_CPU_MULTI_THREADING = True
-
   def __init__(self, llm):
     if not is_ctranslate_available(): raise openllm.exceptions.OpenLLMException('ctranslate is not installed. Do `pip install "openllm[ctranslate]"`')
     self.llm, self.config, self.model, self.tokenizer = llm, llm.config, llm.model, llm.tokenizer
-
   @bentoml.Runnable.method(batchable=False)
   async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
     config, sampling_params = self.config.model_construct_env(stop=list(stop), **attrs).inference_options(self.llm)
     cumulative_logprob, output_token_ids, input_len = 0.0, list(prompt_token_ids), len(prompt_token_ids)
     tokens = self.tokenizer.convert_ids_to_tokens(prompt_token_ids)
-
     async for request_output in self.model.async_generate_tokens(tokens, **sampling_params):
-      if config['logprobs']:
-        cumulative_logprob += request_output.log_prob
+      if config['logprobs']: cumulative_logprob += request_output.log_prob
       output_token_ids.append(request_output.token_id)
       text = self.tokenizer.decode(
-        output_token_ids[input_len:],
-        skip_special_tokens=True,
-        spaces_between_special_tokens=False,
-        clean_up_tokenization_spaces=True,
+        output_token_ids[input_len:], skip_special_tokens=True, #
+        spaces_between_special_tokens=False, clean_up_tokenization_spaces=True, #
       )
       yield GenerationOutput(
-        prompt='',
-        finished=request_output.is_last,
+        prompt_token_ids=prompt_token_ids, #
+        prompt='', finished=request_output.is_last, request_id=request_id, #
         outputs=[
           CompletionChunk(
-            index=0,
-            text=text,
-            token_ids=output_token_ids[input_len:],
-            cumulative_logprob=cumulative_logprob,
-            finish_reason=None,
+            index=0, text=text, finish_reason=None, #
+            token_ids=output_token_ids[input_len:], cumulative_logprob=cumulative_logprob, #
             # TODO: logprobs, but seems like we don't have access to the raw logits
           )
         ],
-        prompt_token_ids=prompt_token_ids,
-        request_id=request_id,
       ).model_dump_json()
 
 
@@ -124,41 +95,30 @@ class CTranslateRunnable(bentoml.Runnable):
 class vLLMRunnable(bentoml.Runnable):
   SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
   SUPPORTS_CPU_MULTI_THREADING = True
-
   def __init__(self, llm):
     if not is_vllm_available(): raise openllm.exceptions.OpenLLMException('vLLM is not installed. Do `pip install "openllm[vllm]"`.')
     import vllm
 
     self.llm, self.config, self.tokenizer = llm, llm.config, llm.tokenizer
     num_gpus, dev = 1, openllm.utils.device_count()
-    if dev >= 2:
-      num_gpus = min(dev // 2 * 2, dev)
-
+    if dev >= 2: num_gpus = min(dev // 2 * 2, dev)
     try:
       self.model = vllm.AsyncLLMEngine.from_engine_args(
         vllm.AsyncEngineArgs(
-          tokenizer_mode='auto',
-          tensor_parallel_size=num_gpus,
-          worker_use_ray=False,
-          engine_use_ray=False,
-          model=llm.bentomodel.path,
-          tokenizer=llm.bentomodel.path,
-          trust_remote_code=llm.trust_remote_code,
-          dtype=llm._torch_dtype,
+          worker_use_ray=False, engine_use_ray=False, #
+          tokenizer_mode='auto', tensor_parallel_size=num_gpus, #
+          model=llm.bentomodel.path, tokenizer=llm.bentomodel.path, #
+          trust_remote_code=llm.trust_remote_code, dtype=llm._torch_dtype, #
           max_model_len=llm._max_model_len,
           quantization=llm.quantise if llm.quantise and llm.quantise in {'awq', 'squeezellm'} else None,
         )
       )
     except Exception as err:
       traceback.print_exc()
-      raise openllm.exceptions.OpenLLMException(
-        f'Failed to initialise vLLMEngine due to the following error:\n{err}'
-      ) from err
-
+      raise openllm.exceptions.OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err
   @bentoml.Runnable.method(batchable=False)
   async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
     config, sampling_params = self.config.model_construct_env(stop=stop, **attrs).inference_options(self.llm)
-
     async for request_output in self.model.generate(None, sampling_params, request_id, prompt_token_ids):
       yield GenerationOutput.from_vllm(request_output).model_dump_json()
 
@@ -167,7 +127,6 @@ class vLLMRunnable(bentoml.Runnable):
 class PyTorchRunnable(bentoml.Runnable):
   SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
   SUPPORTS_CPU_MULTI_THREADING = True
-
   def __init__(self, llm):
     self.llm, self.config, self.model, self.tokenizer = llm, llm.config, llm.model, llm.tokenizer
     self.is_encoder_decoder = llm.model.config.is_encoder_decoder
@@ -175,7 +134,6 @@ class PyTorchRunnable(bentoml.Runnable):
       self.device = llm.model.device
     else:
       self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
   @bentoml.Runnable.method(batchable=False)
   async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs):
     from ._generation import get_context_length, prepare_logits_processor