Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-02-18 22:55:08 -05:00)
fix(sdk): remove broken sdk
Codebase is now around 2.8k lines.

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
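This commit also deprecates `openllm.Runner` in favour of constructing `openllm.LLM` directly and using its `.runner` (see the deprecation warning in the diff below). A minimal sketch of the replacement pattern, assuming a hypothetical model id (`'facebook/opt-1.3b'`) and service name; the exact arguments a real deployment needs may differ:

```python
import bentoml, openllm

# Hypothetical model id, used purely for illustration.
llm = openllm.LLM(model_id='facebook/opt-1.3b')
svc = bentoml.Service(name='llm-service', runners=[llm.runner])

@svc.api(input=bentoml.io.Text(), output=bentoml.io.Text())
async def chat(input: str) -> str:
  pieces = []
  # Collect streamed chunks, mirroring how openllm.LLM.generate gathers iterator output.
  async for it in llm.generate_iterator(input):
    pieces.append(it.outputs[0].text)
  return ''.join(pieces)
```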
@@ -1,7 +1,6 @@
import logging as _logging, os as _os, pathlib as _pathlib, warnings as _warnings
from openllm_cli import _sdk
from . import utils as utils

if utils.DEBUG:
utils.set_debug_mode(True); _logging.basicConfig(level=_logging.NOTSET)
else:
@@ -12,11 +11,8 @@ else:
_warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
_warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
_warnings.filterwarnings('ignore', message='Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated')

COMPILED = _pathlib.Path(__file__).suffix in ('.pyd', '.so')

# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__lazy = utils.LazyModule(
__lazy = utils.LazyModule( # NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__name__,
globals()['__file__'],
{
@@ -34,14 +30,8 @@ __lazy = utils.LazyModule(
'_llm': ['LLM'],
},
extra_objects={
'COMPILED': COMPILED,
'start': _sdk.start,
'start_grpc': _sdk.start_grpc,
'build': _sdk.build,
'import_model': _sdk.import_model,
'list_models': _sdk.list_models,
'COMPILED': COMPILED, 'start': _sdk.start, 'build': _sdk.build, #
'import_model': _sdk.import_model, 'list_models': _sdk.list_models, #
},
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

@@ -1,65 +1,21 @@
from __future__ import annotations
import logging
import os
import typing as t
import warnings

import logging, os, warnings, typing as t
import openllm
from openllm_core._typing_compat import LiteralBackend, ParamSpec
from openllm_core._typing_compat import LiteralBackend
from openllm_core.utils import first_not_none, getenv, is_vllm_available

if t.TYPE_CHECKING:
from ._runners import Runner as _Runner

P = ParamSpec('P')

__all__ = ['Runner']
logger = logging.getLogger(__name__)


def Runner(
model_name: str,
ensure_available: bool = True,
init_local: bool = False,
backend: LiteralBackend | None = None,
llm_config: openllm.LLMConfig | None = None,
**attrs: t.Any,
) -> _Runner[t.Any, t.Any]:
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.

> [!WARNING]
> This method is now deprecated and in favor of 'openllm.LLM'

```python
runner = openllm.Runner("dolly-v2")

@svc.on_startup
def download():
runner.download_model()
```

if `init_local=True` (For development workflow), it will also enable `ensure_available`.
Default value of `ensure_available` is None. If set then use that given value, otherwise fallback to the aforementioned behaviour.

Args:
model_name: Supported model name from 'openllm models'
ensure_available: If True, it will download the model if it is not available. If False, it will skip downloading the model.
If False, make sure the model is available locally. Default to True, and openllm.LLM will always check if models
are available locally. based on generated tag.
backend: The given Runner implementation one choose for this Runner. If `OPENLLM_BACKEND` is set, it will respect it.
llm_config: Optional ``openllm.LLMConfig`` to initialise this ``openllm.LLMRunner``.
init_local: If True, it will initialize the model locally. This is useful if you want to run the model locally. (Symmetrical to bentoml.Runner.init_local())
**attrs: The rest of kwargs will then be passed to the LLM. Refer to the LLM documentation for the kwargs behaviour
"""
from ._llm import LLM

if llm_config is None:
llm_config = openllm.AutoConfig.for_model(model_name)
if not ensure_available:
logger.warning(
"'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation."
)
model_name: str, ensure_available: bool = True, #
init_local: bool = False, backend: LiteralBackend | None = None, #
llm_config: openllm.LLMConfig | None = None, **attrs: t.Any,
):
if llm_config is None: llm_config = openllm.AutoConfig.for_model(model_name)
if not ensure_available: logger.warning("'ensure_available=False' won't have any effect as LLM will always check to download the model on initialisation.")
model_id = attrs.get('model_id', os.getenv('OPENLLM_MODEL_ID', llm_config['default_id']))
_RUNNER_MSG = f'''\
warnings.warn(f'''\
Using 'openllm.Runner' is now deprecated. Make sure to switch to the following syntax:

```python
@@ -70,22 +26,11 @@ def Runner(
@svc.api(...)
async def chat(input: str) -> str:
async for it in llm.generate_iterator(input): print(it)
```
'''
warnings.warn(_RUNNER_MSG, DeprecationWarning, stacklevel=2)
```''', DeprecationWarning, stacklevel=2)
attrs.update(
{
'model_id': model_id,
'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)),
'serialisation': getenv(
'serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']
),
'model_id': model_id, 'quantize': getenv('QUANTIZE', var=['QUANTISE'], default=attrs.get('quantize', None)), #
'serialisation': getenv('serialization', default=attrs.get('serialisation', llm_config['serialisation']), var=['SERIALISATION']),
}
)

backend = t.cast(LiteralBackend, first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'))
llm = LLM[t.Any, t.Any](backend=backend, llm_config=llm_config, embedded=init_local, **attrs)
return llm.runner


__all__ = ['Runner']
return openllm.LLM(backend=first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'), llm_config=llm_config, embedded=init_local, **attrs).runner

@@ -47,23 +47,17 @@ ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]
|
||||
|
||||
@attr.define(slots=False, repr=False, init=False)
|
||||
class LLM(t.Generic[M, T]):
|
||||
async def generate(
|
||||
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
|
||||
) -> GenerationOutput:
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt':
|
||||
raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
|
||||
async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
|
||||
config = self.config.model_construct_env(**attrs)
|
||||
texts, token_ids = [[]] * config['n'], [[]] * config['n']
|
||||
final_result = None
|
||||
async for result in self.generate_iterator(
|
||||
prompt, prompt_token_ids, stop, stop_token_ids, request_id, adapter_name, **config.model_dump(flatten=True)
|
||||
):
|
||||
for output in result.outputs:
|
||||
texts[output.index].append(output.text)
|
||||
token_ids[output.index].extend(output.token_ids)
|
||||
final_result = result
|
||||
if final_result is None:
|
||||
raise RuntimeError('No result is returned.')
|
||||
if (final_result := result) is None: raise RuntimeError('No result is returned.')
|
||||
return final_result.with_options(
|
||||
prompt=prompt,
|
||||
outputs=[
|
||||
@@ -72,13 +66,9 @@ class LLM(t.Generic[M, T]):
|
||||
],
|
||||
)
|
||||
|
||||
async def generate_iterator(
|
||||
self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs
|
||||
) -> t.AsyncGenerator[GenerationOutput, None]:
|
||||
async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs):
|
||||
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
|
||||
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt':
|
||||
raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
|
||||
if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.')
|
||||
|
||||
if isinstance(self.runner._runner_handle, DummyRunnerHandle):
|
||||
if os.getenv('BENTO_PATH') is not None:
|
||||
@@ -87,14 +77,12 @@ class LLM(t.Generic[M, T]):
|
||||
self.runner.init_local(quiet=True)
|
||||
config = self.config.model_construct_env(**attrs)
|
||||
|
||||
if stop_token_ids is None: stop_token_ids = []
|
||||
stop_token_ids = stop_token_ids or []
|
||||
eos_token_id = attrs.get('eos_token_id', config['eos_token_id'])
|
||||
if eos_token_id is not None:
|
||||
if not isinstance(eos_token_id, list): eos_token_id = [eos_token_id]
|
||||
stop_token_ids.extend(eos_token_id)
|
||||
if config['eos_token_id'] and config['eos_token_id'] not in stop_token_ids: stop_token_ids.append(config['eos_token_id'])
|
||||
if self.tokenizer.eos_token_id not in stop_token_ids:
|
||||
stop_token_ids.append(self.tokenizer.eos_token_id)
|
||||
if eos_token_id and not isinstance(eos_token_id, list): eos_token_id = [eos_token_id]
|
||||
stop_token_ids.extend(eos_token_id or [])
|
||||
if (config_eos := config['eos_token_id']) and config_eos not in stop_token_ids: stop_token_ids.append(config_eos)
|
||||
if self.tokenizer.eos_token_id not in stop_token_ids: stop_token_ids.append(self.tokenizer.eos_token_id)
|
||||
if stop is None:
|
||||
stop = set()
|
||||
elif isinstance(stop, str):
|
||||
@@ -102,20 +90,16 @@ class LLM(t.Generic[M, T]):
|
||||
else:
|
||||
stop = set(stop)
|
||||
for tid in stop_token_ids:
|
||||
if tid:
|
||||
stop.add(self.tokenizer.decode(tid))
|
||||
if tid: stop.add(self.tokenizer.decode(tid))
|
||||
|
||||
if prompt_token_ids is None:
|
||||
if prompt is None:
|
||||
raise ValueError('Either prompt or prompt_token_ids must be specified.')
|
||||
if prompt is None: raise ValueError('Either prompt or prompt_token_ids must be specified.')
|
||||
prompt_token_ids = self.tokenizer.encode(prompt)
|
||||
|
||||
request_id = gen_random_uuid() if request_id is None else request_id
|
||||
previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n']
|
||||
try:
|
||||
generator = self.runner.generate_iterator.async_stream(
|
||||
prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True)
|
||||
)
|
||||
generator = self.runner.generate_iterator.async_stream(prompt_token_ids, request_id, stop=list(stop), adapter_name=adapter_name, **config.model_dump(flatten=True))
|
||||
except Exception as err:
|
||||
raise RuntimeError(f'Failed to start generation task: {err}') from err
|
||||
|
||||
@@ -134,18 +118,11 @@ class LLM(t.Generic[M, T]):
|
||||
|
||||
# NOTE: If you are here to see how generate_iterator and generate works, see above.
|
||||
# The below are mainly for internal implementation that you don't have to worry about.
|
||||
_model_id: str
|
||||
_revision: t.Optional[str]
|
||||
_model_id: str; _revision: t.Optional[str] #
|
||||
_quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]]
|
||||
_quantise: t.Optional[LiteralQuantise]
|
||||
_model_decls: t.Tuple[t.Any, ...]
|
||||
__model_attrs: t.Dict[str, t.Any]
|
||||
__tokenizer_attrs: t.Dict[str, t.Any]
|
||||
_tag: bentoml.Tag
|
||||
_adapter_map: t.Optional[AdapterMap]
|
||||
_serialisation: LiteralSerialisation
|
||||
_local: bool
|
||||
_max_model_len: t.Optional[int]
|
||||
_quantise: t.Optional[LiteralQuantise]; _model_decls: t.Tuple[t.Any, ...]; __model_attrs: t.Dict[str, t.Any] #
|
||||
__tokenizer_attrs: t.Dict[str, t.Any]; _tag: bentoml.Tag; _adapter_map: t.Optional[AdapterMap] #
|
||||
_serialisation: LiteralSerialisation; _local: bool; _max_model_len: t.Optional[int] #
|
||||
|
||||
__llm_dtype__: t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']] = 'auto'
|
||||
__llm_torch_dtype__: 'torch.dtype' = None
|
||||
@@ -180,12 +157,7 @@ class LLM(t.Generic[M, T]):
|
||||
):
|
||||
torch_dtype = attrs.pop('torch_dtype', None) # backward compatible
|
||||
if torch_dtype is not None:
|
||||
warnings.warn(
|
||||
'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.',
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
dtype = torch_dtype
|
||||
warnings.warn('The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3); dtype = torch_dtype
|
||||
_local = False
|
||||
if validate_is_path(model_id): model_id, _local = resolve_filepath(model_id), True
|
||||
backend = getenv('backend', default=backend)
|
||||
@@ -291,7 +263,7 @@ class LLM(t.Generic[M, T]):
|
||||
if is_vllm_available():
|
||||
return 'vllm'
|
||||
elif is_ctranslate_available():
|
||||
return 'ctranslate' # XXX: base OpenLLM image should always include vLLM
|
||||
return 'ctranslate'
|
||||
elif is_ctranslate_available():
|
||||
return 'ctranslate'
|
||||
else:
|
||||
@@ -449,8 +421,7 @@ def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap:
|
||||
config_file = hf_hub_download(path_or_adapter_id, PEFT_CONFIG_NAME)
|
||||
except Exception as err:
|
||||
raise ValueError(f"Can't find '{PEFT_CONFIG_NAME}' at '{path_or_adapter_id}'") from err
|
||||
with open(config_file, 'r') as file:
|
||||
resolved_config = orjson.loads(file.read())
|
||||
with open(config_file, 'r') as file: resolved_config = orjson.loads(file.read())
|
||||
_peft_type = resolved_config['peft_type'].lower()
|
||||
if _peft_type not in resolved: resolved[_peft_type] = ()
|
||||
resolved[_peft_type] += (_AdapterTuple((path_or_adapter_id, name, resolved_config)),)
|
||||
|
||||
@@ -1,13 +1,8 @@
from __future__ import annotations

from openllm_core.exceptions import MissingDependencyError
from openllm_core.utils import is_autoawq_available, is_autogptq_available, is_bitsandbytes_available


def infer_quantisation_config(llm, quantise, **attrs):
import torch
import transformers

import torch, transformers
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
@@ -85,25 +80,19 @@ def infer_quantisation_config(llm, quantise, **attrs):

# NOTE: Quantization setup quantize is a openllm.LLM feature, where we can quantize the model with bitsandbytes or quantization aware training.
if not is_bitsandbytes_available():
raise RuntimeError(
'Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\''
)
raise RuntimeError('Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with \'pip install "openllm[fine-tune]"\'')
if quantise == 'int8':
quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
quantisation_config = create_int4_config()
elif quantise == 'gptq':
if not is_autogptq_available():
raise MissingDependencyError(
"GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'"
)
raise MissingDependencyError("GPTQ requires 'auto-gptq' and 'optimum>=0.12' to be installed. Do it with 'pip install \"openllm[gptq]\"'")
else:
quantisation_config = create_gptq_config()
elif quantise == 'awq':
if not is_autoawq_available():
raise MissingDependencyError(
"AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'."
)
raise MissingDependencyError("AWQ requires 'auto-awq' to be installed. Do it with 'pip install \"openllm[awq]\"'.")
else:
quantisation_config = create_awq_config()
else:

@@ -1,66 +1,43 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
import logging, typing as t
|
||||
import _service_vars as svars
|
||||
|
||||
import bentoml
|
||||
import openllm
|
||||
import bentoml, openllm
|
||||
from openllm_core._schemas import MessageParam
|
||||
from bentoml.io import JSON, Text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
llm = openllm.LLM[t.Any, t.Any](
|
||||
model_id=svars.model_id,
|
||||
model_tag=svars.model_tag,
|
||||
serialisation=svars.serialization,
|
||||
adapter_map=svars.adapter_map,
|
||||
trust_remote_code=svars.trust_remote_code,
|
||||
model_id=svars.model_id, model_tag=svars.model_tag, adapter_map=svars.adapter_map, #
|
||||
serialisation=svars.serialization, trust_remote_code=svars.trust_remote_code,
|
||||
)
|
||||
svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner])
|
||||
|
||||
llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
|
||||
|
||||
|
||||
@svc.api(
|
||||
route='/v1/generate',
|
||||
input=JSON.from_sample(llm_model_class.examples()),
|
||||
output=JSON.from_sample(openllm.GenerationOutput.examples()),
|
||||
input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()), #
|
||||
)
|
||||
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
|
||||
return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
|
||||
|
||||
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]: return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
|
||||
|
||||
@svc.api(
|
||||
route='/v1/generate_stream',
|
||||
input=JSON.from_sample(llm_model_class.examples()),
|
||||
output=Text(content_type='text/event-stream'),
|
||||
input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'), #
|
||||
)
|
||||
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
|
||||
async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
|
||||
yield f'data: {it.model_dump_json()}\n\n'
|
||||
yield 'data: [DONE]\n\n'
|
||||
|
||||
|
||||
_Metadata = openllm.MetadataOutput(
|
||||
timeout=llm.config['timeout'],
|
||||
model_name=llm.config['model_name'],
|
||||
backend=llm.__llm_backend__,
|
||||
model_id=llm.model_id,
|
||||
timeout=llm.config['timeout'], model_name=llm.config['model_name'], #
|
||||
backend=llm.__llm_backend__, model_id=llm.model_id, #
|
||||
configuration=llm.config.model_dump_json().decode(),
|
||||
)
|
||||
|
||||
|
||||
@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
|
||||
def metadata_v1(_: str) -> openllm.MetadataOutput:
|
||||
return _Metadata
|
||||
|
||||
|
||||
class MessagesConverterInput(t.TypedDict):
|
||||
add_generation_prompt: bool
|
||||
messages: t.List[t.Dict[str, t.Any]]
|
||||
def metadata_v1(_: str) -> openllm.MetadataOutput: return _Metadata
|
||||
|
||||
class MessagesConverterInput(t.TypedDict): add_generation_prompt: bool; messages: t.List[t.Dict[str, t.Any]]
|
||||
|
||||
@svc.api(
|
||||
route='/v1/helpers/messages',
|
||||
@@ -69,18 +46,14 @@ class MessagesConverterInput(t.TypedDict):
|
||||
add_generation_prompt=False,
|
||||
messages=[
|
||||
MessageParam(role='system', content='You are acting as Ernest Hemmingway.'),
|
||||
MessageParam(role='user', content='Hi there!'),
|
||||
MessageParam(role='assistant', content='Yes?'),
|
||||
MessageParam(role='user', content='Hi there!'), MessageParam(role='assistant', content='Yes?'), #
|
||||
],
|
||||
)
|
||||
),
|
||||
output=Text(),
|
||||
)
|
||||
def helpers_messages_v1(message: MessagesConverterInput) -> str:
|
||||
add_generation_prompt = message['add_generation_prompt']
|
||||
messages = message['messages']
|
||||
add_generation_prompt, messages = message['add_generation_prompt'], message['messages']
|
||||
return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
|
||||
|
||||
|
||||
# HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
|
||||
openllm.mount_entrypoints(svc, llm)
|
||||
openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
|
||||
|
||||
@@ -1,9 +1,2 @@
import os, orjson, openllm_core.utils as coreutils

model_id, model_tag, adapter_map, serialization, trust_remote_code = (
os.environ['OPENLLM_MODEL_ID'],
None,
orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))),
os.getenv('OPENLLM_SERIALIZATION', default='safetensors'),
coreutils.check_bool_env('TRUST_REMOTE_CODE', False),
)
model_id, model_tag, adapter_map, serialization, trust_remote_code = os.environ['OPENLLM_MODEL_ID'], None, orjson.loads(os.getenv('OPENLLM_ADAPTER_MAP', orjson.dumps(None))), os.getenv('OPENLLM_SERIALIZATION', default='safetensors'), coreutils.check_bool_env('TRUST_REMOTE_CODE', False)

@@ -4,44 +4,33 @@ import psutil, bentoml, openllm_core.utils as coreutils
|
||||
from bentoml._internal.resource import get_resource, system_resources
|
||||
from bentoml._internal.runner.strategy import THREAD_ENVS
|
||||
|
||||
__all__ = ['CascadingResourceStrategy', 'get_resource']
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _strtoul(s: str) -> int:
|
||||
# Return -1 or positive integer sequence string starts with.
|
||||
if not s:
|
||||
return -1
|
||||
if not s: return -1
|
||||
idx = 0
|
||||
for idx, c in enumerate(s):
|
||||
if not (c.isdigit() or (idx == 0 and c in '+-')):
|
||||
break
|
||||
if idx + 1 == len(s):
|
||||
idx += 1 # noqa: PLW2901
|
||||
if not (c.isdigit() or (idx == 0 and c in '+-')): break
|
||||
if idx + 1 == len(s): idx += 1 # noqa: PLW2901
|
||||
# NOTE: idx will be set via enumerate
|
||||
return int(s[:idx]) if idx > 0 else -1
|
||||
|
||||
|
||||
def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]:
|
||||
rcs: list[str] = []
|
||||
rcs = []
|
||||
for elem in lst.split(','):
|
||||
# Repeated id results in empty set
|
||||
if elem in rcs:
|
||||
return []
|
||||
if elem in rcs: return []
|
||||
# Anything other but prefix is ignored
|
||||
if not elem.startswith(prefix):
|
||||
break
|
||||
if not elem.startswith(prefix): break
|
||||
rcs.append(elem)
|
||||
return rcs
|
||||
|
||||
|
||||
def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None:
|
||||
if respect_env:
|
||||
spec = os.environ.get('CUDA_VISIBLE_DEVICES', default_var)
|
||||
if not spec:
|
||||
return None
|
||||
if not spec: return None
|
||||
else:
|
||||
if default_var is None:
|
||||
raise ValueError('spec is required to be not None when parsing spec.')
|
||||
if default_var is None: raise ValueError('spec is required to be not None when parsing spec.')
|
||||
spec = default_var
|
||||
|
||||
if spec.startswith('GPU-'):
|
||||
@@ -55,64 +44,52 @@ def _parse_cuda_visible_devices(default_var: str | None = None, respect_env: boo
|
||||
for el in spec.split(','):
|
||||
x = _strtoul(el.strip())
|
||||
# Repeated ordinal results in empty set
|
||||
if x in rc:
|
||||
return []
|
||||
if x in rc: return []
|
||||
# Negative value aborts the sequence
|
||||
if x < 0:
|
||||
break
|
||||
if x < 0: break
|
||||
rc.append(x)
|
||||
return [str(i) for i in rc]
|
||||
|
||||
|
||||
def _raw_device_uuid_nvml() -> list[str] | None:
|
||||
from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer
|
||||
|
||||
try:
|
||||
nvml_h = CDLL('libnvidia-ml.so.1')
|
||||
except Exception:
|
||||
warnings.warn('Failed to find nvidia binding', stacklevel=3)
|
||||
return None
|
||||
warnings.warn('Failed to find nvidia binding', stacklevel=3); return None
|
||||
|
||||
rc = nvml_h.nvmlInit()
|
||||
if rc != 0:
|
||||
warnings.warn("Can't initialize NVML", stacklevel=3)
|
||||
return None
|
||||
warnings.warn("Can't initialize NVML", stacklevel=3); return None
|
||||
dev_count = c_int(-1)
|
||||
rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
|
||||
if rc != 0:
|
||||
warnings.warn('Failed to get available device from system.', stacklevel=3)
|
||||
return None
|
||||
uuids: list[str] = []
|
||||
warnings.warn('Failed to get available device from system.', stacklevel=3); return None
|
||||
uuids = []
|
||||
for idx in range(dev_count.value):
|
||||
dev_id = c_void_p()
|
||||
rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
|
||||
if rc != 0:
|
||||
warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3)
|
||||
return None
|
||||
warnings.warn(f'Failed to get device handle for {idx}', stacklevel=3); return None
|
||||
buf_len = 96
|
||||
buf = create_string_buffer(buf_len)
|
||||
rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
|
||||
if rc != 0:
|
||||
warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3)
|
||||
return None
|
||||
warnings.warn(f'Failed to get device UUID for {idx}', stacklevel=3); return None
|
||||
uuids.append(buf.raw.decode('ascii').strip('\0'))
|
||||
del nvml_h
|
||||
return uuids
|
||||
|
||||
|
||||
class _ResourceMixin:
|
||||
@staticmethod
|
||||
def from_system(cls) -> list[str]:
|
||||
visible_devices = _parse_cuda_visible_devices()
|
||||
if visible_devices is None:
|
||||
if cls.resource_id == 'amd.com/gpu':
|
||||
if not psutil.LINUX:
|
||||
if coreutils.DEBUG:
|
||||
logger.debug('AMD GPUs is currently only supported on Linux.')
|
||||
return []
|
||||
if not psutil.LINUX: return []
|
||||
# ROCm does not currently have the rocm_smi wheel.
|
||||
# So we need to use the ctypes bindings directly.
|
||||
# we don't want to use CLI because parsing is a pain.
|
||||
# TODO: Use tinygrad/gpuctypes
|
||||
sys.path.append('/opt/rocm/libexec/rocm_smi')
|
||||
try:
|
||||
from ctypes import byref, c_uint32
|
||||
@@ -122,8 +99,7 @@ class _ResourceMixin:
|
||||
|
||||
device_count = c_uint32(0)
|
||||
ret = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
|
||||
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
|
||||
return [str(i) for i in range(device_count.value)]
|
||||
if ret == rsmi_status_t.RSMI_STATUS_SUCCESS: return [str(i) for i in range(device_count.value)]
|
||||
return []
|
||||
# In this case the binary is not found, returning empty list
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
@@ -140,59 +116,43 @@ class _ResourceMixin:
|
||||
except (ImportError, RuntimeError, AttributeError):
|
||||
return []
|
||||
return visible_devices
|
||||
|
||||
@staticmethod
|
||||
def from_spec(cls, spec) -> list[str]:
|
||||
if isinstance(spec, int):
|
||||
if spec in (-1, 0):
|
||||
return []
|
||||
if spec < -1:
|
||||
raise ValueError('Spec cannot be < -1.')
|
||||
if spec in (-1, 0): return []
|
||||
if spec < -1: raise ValueError('Spec cannot be < -1.')
|
||||
return [str(i) for i in range(spec)]
|
||||
elif isinstance(spec, str):
|
||||
if not spec:
|
||||
return []
|
||||
if spec.isdigit():
|
||||
spec = ','.join([str(i) for i in range(_strtoul(spec))])
|
||||
if not spec: return []
|
||||
if spec.isdigit(): spec = ','.join([str(i) for i in range(_strtoul(spec))])
|
||||
return _parse_cuda_visible_devices(spec, respect_env=False)
|
||||
elif isinstance(spec, list):
|
||||
return [str(x) for x in spec]
|
||||
else:
|
||||
raise TypeError(
|
||||
f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead."
|
||||
)
|
||||
|
||||
raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.")
|
||||
@staticmethod
|
||||
def validate(cls, val: list[t.Any]) -> None:
|
||||
if cls.resource_id == 'amd.com/gpu':
|
||||
raise RuntimeError(
|
||||
"AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'"
|
||||
)
|
||||
raise RuntimeError("AMD GPU validation is not yet supported. Make sure to call 'get_resource(..., validate=False)'")
|
||||
if not all(isinstance(i, str) for i in val):
|
||||
raise ValueError('Input list should be all string type.')
|
||||
|
||||
try:
|
||||
from cuda import cuda
|
||||
|
||||
err, *_ = cuda.cuInit(0)
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise RuntimeError('Failed to initialise CUDA runtime binding.')
|
||||
if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.')
|
||||
# correctly parse handle
|
||||
for el in val:
|
||||
if el.startswith(('GPU-', 'MIG-')):
|
||||
uuids = _raw_device_uuid_nvml()
|
||||
if uuids is None:
|
||||
raise ValueError('Failed to parse available GPUs UUID')
|
||||
if el not in uuids:
|
||||
raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
|
||||
if uuids is None: raise ValueError('Failed to parse available GPUs UUID')
|
||||
if el not in uuids: raise ValueError(f'Given UUID {el} is not found with available UUID (available: {uuids})')
|
||||
elif el.isdigit():
|
||||
err, _ = cuda.cuDeviceGet(int(el))
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise ValueError(f'Failed to get device {el}')
|
||||
if err != cuda.CUresult.CUDA_SUCCESS: raise ValueError(f'Failed to get device {el}')
|
||||
except (ImportError, RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[bentoml.Resource[t.List[str]]]:
|
||||
return types.new_class(
|
||||
name,
|
||||
@@ -201,22 +161,16 @@ def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[
|
||||
lambda ns: ns.update(
|
||||
{
|
||||
'resource_id': resource_kind,
|
||||
'from_spec': classmethod(_ResourceMixin.from_spec),
|
||||
'from_system': classmethod(_ResourceMixin.from_system),
|
||||
'validate': classmethod(_ResourceMixin.validate),
|
||||
'__repr_keys__': property(lambda _: {'resource_id'}),
|
||||
'__doc__': inspect.cleandoc(docstring),
|
||||
'__module__': 'openllm._strategies',
|
||||
'from_spec': classmethod(_ResourceMixin.from_spec), 'from_system': classmethod(_ResourceMixin.from_system), #
|
||||
'validate': classmethod(_ResourceMixin.validate), '__repr_keys__': property(lambda _: {'resource_id'}), #
|
||||
'__doc__': inspect.cleandoc(docstring), '__module__': 'openllm._strategies', #
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
NvidiaGpuResource = _make_resource_class(
|
||||
'NvidiaGpuResource',
|
||||
'nvidia.com/gpu',
|
||||
'''NVIDIA GPU resource.
|
||||
|
||||
This is a modified version of internal's BentoML's NvidiaGpuResource
|
||||
where it respects and parse CUDA_VISIBLE_DEVICES correctly.''',
|
||||
)
|
||||
@@ -224,73 +178,53 @@ AmdGpuResource = _make_resource_class(
|
||||
'AmdGpuResource',
|
||||
'amd.com/gpu',
|
||||
'''AMD GPU resource.
|
||||
|
||||
Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
|
||||
``NvidiaGpuResource``. Currently ``validate`` is not yet supported.''',
|
||||
)
|
||||
|
||||
|
||||
class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
|
||||
@classmethod
|
||||
def get_worker_count(cls, runnable_class, resource_request, workers_per_resource):
|
||||
if resource_request is None:
|
||||
resource_request = system_resources()
|
||||
if resource_request is None: resource_request = system_resources()
|
||||
# use NVIDIA
|
||||
kind = 'nvidia.com/gpu'
|
||||
nvidia_req = get_resource(resource_request, kind)
|
||||
if nvidia_req is not None:
|
||||
return 1
|
||||
if nvidia_req is not None: return 1
|
||||
# use AMD
|
||||
kind = 'amd.com/gpu'
|
||||
amd_req = get_resource(resource_request, kind, validate=False)
|
||||
if amd_req is not None:
|
||||
return 1
|
||||
if amd_req is not None: return 1
|
||||
# use CPU
|
||||
cpus = get_resource(resource_request, 'cpu')
|
||||
if cpus is not None and cpus > 0:
|
||||
if 'cpu' not in runnable_class.SUPPORTED_RESOURCES:
|
||||
logger.warning('No known supported resource available for %s, falling back to using CPU.', runnable_class)
|
||||
|
||||
if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
|
||||
if isinstance(workers_per_resource, float) and workers_per_resource < 1.0:
|
||||
raise ValueError('Fractional CPU multi threading support is not yet supported.')
|
||||
if isinstance(workers_per_resource, float) and workers_per_resource < 1.0: raise ValueError('Fractional CPU multi threading support is not yet supported.')
|
||||
return int(workers_per_resource)
|
||||
return math.ceil(cpus) * workers_per_resource
|
||||
|
||||
# this should not be reached by user since we always read system resource as default
|
||||
raise ValueError(
|
||||
f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.'
|
||||
)
|
||||
|
||||
raise ValueError(f'No known supported resource available for {runnable_class}. Please check your resource request. Leaving it blank will allow BentoML to use system resources.')
|
||||
@classmethod
|
||||
def get_worker_env(cls, runnable_class, resource_request, workers_per_resource, worker_index):
|
||||
cuda_env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
|
||||
disabled = cuda_env in ('', '-1')
|
||||
environ: dict[str, t.Any] = {}
|
||||
environ = {}
|
||||
|
||||
if resource_request is None:
|
||||
resource_request = system_resources()
|
||||
if resource_request is None: resource_request = system_resources()
|
||||
# use NVIDIA
|
||||
kind = 'nvidia.com/gpu'
|
||||
typ = get_resource(resource_request, kind)
|
||||
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
|
||||
if disabled:
|
||||
logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index)
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cuda_env
|
||||
return environ
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index)
|
||||
logger.debug('Environ for worker %s: %s', worker_index, environ)
|
||||
return environ
|
||||
# use AMD
|
||||
kind = 'amd.com/gpu'
|
||||
typ = get_resource(resource_request, kind, validate=False)
|
||||
if typ is not None and len(typ) > 0 and kind in runnable_class.SUPPORTED_RESOURCES:
|
||||
if disabled:
|
||||
logger.debug('CUDA_VISIBLE_DEVICES is disabled, %s will not be using GPU.', worker_index)
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cuda_env
|
||||
return environ
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cuda_env; return environ
|
||||
environ['CUDA_VISIBLE_DEVICES'] = cls.transpile_workers_to_cuda_envvar(workers_per_resource, typ, worker_index)
|
||||
logger.debug('Environ for worker %s: %s', worker_index, environ)
|
||||
return environ
|
||||
# use CPU
|
||||
cpus = get_resource(resource_request, 'cpu')
|
||||
@@ -298,25 +232,17 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
|
||||
environ['CUDA_VISIBLE_DEVICES'] = '-1' # disable gpu
|
||||
if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
|
||||
thread_count = math.ceil(cpus)
|
||||
for thread_env in THREAD_ENVS:
|
||||
environ[thread_env] = os.environ.get(thread_env, str(thread_count))
|
||||
logger.debug('Environ for worker %s: %s', worker_index, environ)
|
||||
for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, str(thread_count))
|
||||
return environ
|
||||
for thread_env in THREAD_ENVS:
|
||||
environ[thread_env] = os.environ.get(thread_env, '1')
|
||||
for thread_env in THREAD_ENVS: environ[thread_env] = os.environ.get(thread_env, '1')
|
||||
return environ
|
||||
return environ
|
||||
|
||||
@staticmethod
|
||||
def transpile_workers_to_cuda_envvar(workers_per_resource, gpus, worker_index):
|
||||
# Convert given workers_per_resource to correct CUDA_VISIBLE_DEVICES string.
|
||||
if isinstance(workers_per_resource, float):
|
||||
# NOTE: We hit this branch when workers_per_resource is set to
|
||||
# float, for example 0.5 or 0.25
|
||||
if workers_per_resource > 1:
|
||||
raise ValueError(
|
||||
"Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case."
|
||||
)
|
||||
# NOTE: We hit this branch when workers_per_resource is set to float, for example 0.5 or 0.25
|
||||
if workers_per_resource > 1: raise ValueError('workers_per_resource > 1 is not supported.')
|
||||
# We are round the assigned resource here. This means if workers_per_resource=.4
|
||||
# then it will round down to 2. If workers_per_source=0.6, then it will also round up to 2.
|
||||
assigned_resource_per_worker = round(1 / workers_per_resource)
|
||||
@@ -327,21 +253,12 @@ class CascadingResourceStrategy(bentoml.Strategy, coreutils.ReprMixin):
|
||||
worker_index,
|
||||
assigned_resource_per_worker,
|
||||
)
|
||||
raise IndexError(
|
||||
f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]."
|
||||
)
|
||||
assigned_gpu = gpus[
|
||||
assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)
|
||||
]
|
||||
raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].")
|
||||
assigned_gpu = gpus[assigned_resource_per_worker * worker_index : assigned_resource_per_worker * (worker_index + 1)]
|
||||
dev = ','.join(assigned_gpu)
|
||||
else:
|
||||
idx = worker_index // workers_per_resource
|
||||
if idx >= len(gpus):
|
||||
raise ValueError(
|
||||
f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}'
|
||||
)
|
||||
raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}')
|
||||
dev = str(gpus[idx])
|
||||
return dev
|
||||
|
||||
|
||||
__all__ = ['CascadingResourceStrategy', 'get_resource']
|
||||
|
||||
@@ -4,7 +4,6 @@ from openllm_core._typing_compat import LiteralVersionStrategy
|
||||
from openllm_core.exceptions import OpenLLMException
|
||||
from openllm_core.utils.lazy import VersionInfo, LazyModule
|
||||
|
||||
_OWNER, _REPO = 'bentoml', 'openllm'
|
||||
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
|
||||
class RefResolver:
|
||||
git_hash: str = attr.field()
|
||||
@@ -17,7 +16,7 @@ class RefResolver:
|
||||
if strategy_or_version is None or strategy_or_version == 'release':
|
||||
try:
|
||||
from ghapi.all import GhApi
|
||||
ghapi = GhApi(owner=_OWNER, repo=_REPO, authenticate=False)
|
||||
ghapi = GhApi(owner='bentoml', repo='openllm', authenticate=False)
|
||||
meta = ghapi.repos.get_latest_release()
|
||||
git_hash = ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha']
|
||||
except Exception as err:
|
||||
@@ -35,6 +34,4 @@ __lazy = LazyModule(
|
||||
{'_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options']},
|
||||
extra_objects={'RefResolver': RefResolver}
|
||||
)
|
||||
__all__ = __lazy.__all__
|
||||
__dir__ = __lazy.__dir__
|
||||
__getattr__ = __lazy.__getattr__
|
||||
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__
|
||||
|
||||
@@ -1,15 +1,7 @@
|
||||
# mypy: disable-error-code="misc"
|
||||
from __future__ import annotations
|
||||
import importlib.metadata
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import orjson
|
||||
import importlib.metadata, logging, os, pathlib
|
||||
import bentoml, orjson, openllm_core
|
||||
from simple_di import Provide, inject
|
||||
|
||||
import bentoml
|
||||
import openllm_core
|
||||
from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions
|
||||
from bentoml._internal.configuration.containers import BentoMLContainer
|
||||
from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
|
||||
@@ -17,7 +9,7 @@ from openllm_core.utils import SHOW_CODEGEN, check_bool_env, pkg
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD'
|
||||
_service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py'
|
||||
_service_file = pathlib.Path(os.path.abspath(__file__)).parent.parent / '_service.py'
|
||||
_SERVICE_VARS = '''import orjson;model_id,model_tag,adapter_map,serialization,trust_remote_code='{__model_id__}','{__model_tag__}',orjson.loads("""{__model_adapter_map__}"""),'{__model_serialization__}',{__model_trust_remote_code__}'''
|
||||
|
||||
def build_editable(path, package='openllm'):
|
||||
@@ -28,7 +20,7 @@ def build_editable(path, package='openllm'):
|
||||
from build.env import IsolatedEnvBuilder
|
||||
module_location = pkg.source_locations(package)
|
||||
if not module_location: raise RuntimeError('Could not find the source location of OpenLLM.')
|
||||
pyproject_path = Path(module_location).parent.parent / 'pyproject.toml'
|
||||
pyproject_path = pathlib.Path(module_location).parent.parent / 'pyproject.toml'
|
||||
if os.path.isfile(pyproject_path.__fspath__()):
|
||||
with IsolatedEnvBuilder() as env:
|
||||
builder = ProjectBuilder(pyproject_path.parent)
|
||||
@@ -70,12 +62,9 @@ def create_bento(
|
||||
labels = dict(llm.identifying_params)
|
||||
labels.update(
|
||||
{
|
||||
'_type': llm.llm_type, '_framework': llm.__llm_backend__, 'start_name': llm.config['start_name'],
|
||||
'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle',
|
||||
**{
|
||||
f'{package.replace("-","_")}_version': importlib.metadata.version(package)
|
||||
for package in {'openllm', 'openllm-core', 'openllm-client'}
|
||||
},
|
||||
'_type': llm.llm_type, '_framework': llm.__llm_backend__,
|
||||
'start_name': llm.config['start_name'], 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle',
|
||||
**{f'{package.replace("-","_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
|
||||
}
|
||||
)
|
||||
if adapter_map: labels.update(adapter_map)
|
||||
@@ -83,18 +72,15 @@ def create_bento(
|
||||
logger.debug("Building Bento '%s' with model backend '%s'", bento_tag, llm.__llm_backend__)
|
||||
logger.debug('Generating service vars %s (dir=%s)', llm.model_id, llm_fs.getsyspath('/'))
|
||||
script = f"# fmt: off\n# GENERATED BY 'openllm build {llm.model_id}'. DO NOT EDIT\n" + _SERVICE_VARS.format(
|
||||
__model_id__=llm.model_id,
|
||||
__model_tag__=str(llm.tag),
|
||||
__model_adapter_map__=orjson.dumps(adapter_map).decode(),
|
||||
__model_serialization__=llm.config['serialisation'],
|
||||
__model_id__=llm.model_id, __model_tag__=str(llm.tag), #
|
||||
__model_adapter_map__=orjson.dumps(adapter_map).decode(), __model_serialization__=llm.config['serialisation'], #
|
||||
__model_trust_remote_code__=str(llm.trust_remote_code),
|
||||
)
|
||||
if SHOW_CODEGEN: logger.info('Generated _service_vars.py:\n%s', script)
|
||||
llm_fs.writetext('_service_vars.py', script)
|
||||
with open(_service_file.__fspath__(), 'r') as f: service_src = f.read()
|
||||
llm_fs.writetext(llm.config['service_name'], service_src)
|
||||
|
||||
bento = bentoml.Bento.create(
|
||||
return bentoml.Bento.create(
|
||||
version=bento_tag.version,
|
||||
build_ctx=llm_fs.getsyspath('/'),
|
||||
build_config=BentoBuildConfig(
|
||||
@@ -108,6 +94,4 @@ def create_bento(
|
||||
python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map),
|
||||
docker=construct_docker_options(llm, llm_fs, quantize, adapter_map, dockerfile_template, _serialisation),
|
||||
),
|
||||
)
|
||||
|
||||
return bento.save(bento_store=_bento_store, model_store=_model_store)
|
||||
).save(bento_store=_bento_store, model_store=_model_store)
|
||||
|
||||
@@ -1,10 +1,2 @@
def __dir__():
import openllm_client as _client

return sorted(dir(_client))


def __getattr__(it):
import openllm_client as _client

return getattr(_client, it)
def __dir__(): import openllm_client as _client; return sorted(dir(_client))
def __getattr__(it): import openllm_client as _client; return getattr(_client, it)

@@ -1,20 +1,11 @@
import importlib

from openllm_core.utils import LazyModule

_import_structure = {'openai': [], 'hf': [], 'cohere': []}


def mount_entrypoints(svc, llm):
for module_name in _import_structure:
module = importlib.import_module(f'.{module_name}', __name__)
svc = module.mount_to_svc(svc, llm)
return svc


__lazy = LazyModule(
__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints}
)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects={'mount_entrypoints': mount_entrypoints})
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

@@ -1,17 +1,11 @@
|
||||
from __future__ import annotations
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
import traceback
|
||||
import functools, json, logging, traceback
|
||||
from http import HTTPStatus
|
||||
|
||||
import orjson
|
||||
from starlette.applications import Starlette
|
||||
from starlette.responses import JSONResponse, StreamingResponse
|
||||
from starlette.routing import Route
|
||||
|
||||
from openllm_core.utils import DEBUG, converter, gen_random_uuid
|
||||
|
||||
from ._openapi import add_schema_definitions, append_schemas, get_generator
|
||||
from ..protocol.cohere import (
|
||||
Chat,
|
||||
@@ -54,41 +48,31 @@ schemas = get_generator(
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def jsonify_attr(obj):
|
||||
return json.dumps(converter.unstructure(obj))
|
||||
|
||||
def jsonify_attr(obj): return json.dumps(converter.unstructure(obj))
|
||||
|
||||
def error_response(status_code, message):
|
||||
return JSONResponse(converter.unstructure(CohereErrorResponse(text=message)), status_code=status_code.value)
|
||||
|
||||
|
||||
async def check_model(request, model):
|
||||
if request.model is None or request.model == model:
|
||||
return None
|
||||
if request.model is None or request.model == model: return None
|
||||
return error_response(
|
||||
HTTPStatus.NOT_FOUND,
|
||||
f"Model '{request.model}' does not exists. Try 'GET /v1/models' to see current running models.",
|
||||
)
|
||||
|
||||
|
||||
def mount_to_svc(svc, llm):
|
||||
app = Starlette(
|
||||
debug=True,
|
||||
routes=[
|
||||
Route(
|
||||
'/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']
|
||||
),
|
||||
Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']),
|
||||
Route('/schema', endpoint=lambda req: schemas.OpenAPIResponse(req), include_in_schema=False),
|
||||
Route('/v1/chat', endpoint=functools.partial(cohere_chat, llm=llm), name='cohere_chat', methods=['POST']),
|
||||
Route('/v1/generate', endpoint=functools.partial(cohere_generate, llm=llm), name='cohere_generate', methods=['POST']),
|
||||
],
|
||||
)
|
||||
mount_path = '/cohere'
|
||||
|
||||
svc.mount_asgi_app(app, path=mount_path)
|
||||
return append_schemas(
|
||||
svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG
|
||||
)
|
||||
|
||||
return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append', inject=DEBUG)
|
||||
|
||||
@add_schema_definitions
|
||||
async def cohere_generate(req, llm):
|
||||
@@ -181,7 +165,6 @@ def _transpile_cohere_chat_messages(request: CohereChatRequest) -> list[dict[str
|
||||
messages.append({'role': 'user', 'content': request.message})
|
||||
return messages
|
||||
|
||||
|
||||
@add_schema_definitions
|
||||
async def cohere_chat(req, llm):
|
||||
json_str = await req.body()
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
import functools
|
||||
import logging
|
||||
import functools, logging
|
||||
from http import HTTPStatus
|
||||
|
||||
import orjson
|
||||
from starlette.applications import Starlette
|
||||
from starlette.responses import JSONResponse
|
||||
from starlette.routing import Route
|
||||
|
||||
from openllm_core.utils import converter
|
||||
|
||||
from ._openapi import add_schema_definitions, append_schemas, get_generator
|
||||
from ..protocol.hf import AgentRequest, AgentResponse, HFErrorResponse
|
||||
|
||||
@@ -25,7 +21,6 @@ schemas = get_generator(
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def mount_to_svc(svc, llm):
|
||||
app = Starlette(
|
||||
debug=True,
|
||||
@@ -39,13 +34,8 @@ def mount_to_svc(svc, llm):
|
||||
svc.mount_asgi_app(app, path=mount_path)
|
||||
return append_schemas(svc, schemas.get_schema(routes=app.routes, mount_path=mount_path), tags_order='append')
|
||||
|
||||
|
||||
def error_response(status_code, message):
|
||||
return JSONResponse(
|
||||
converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)),
|
||||
status_code=status_code.value,
|
||||
)
|
||||
|
||||
return JSONResponse(converter.unstructure(HFErrorResponse(message=message, error_code=status_code.value)), status_code=status_code.value)
|
||||
|
||||
@add_schema_definitions
|
||||
async def hf_agent(req, llm):
|
||||
@@ -60,18 +50,14 @@ async def hf_agent(req, llm):
|
||||
stop = request.parameters.pop('stop', ['\n'])
|
||||
try:
|
||||
result = await llm.generate(request.inputs, stop=stop, **request.parameters)
|
||||
return JSONResponse(
|
||||
converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value
|
||||
)
|
||||
return JSONResponse(converter.unstructure([AgentResponse(generated_text=result.outputs[0].text)]), status_code=HTTPStatus.OK.value)
|
||||
except Exception as err:
|
||||
logger.error('Error while generating: %s', err)
|
||||
return error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'Error while generating (Check server log).')
|
||||
|
||||
|
||||
@add_schema_definitions
|
||||
def hf_adapters(req, llm):
|
||||
if not llm.has_adapters:
|
||||
return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
|
||||
if not llm.has_adapters: return error_response(HTTPStatus.NOT_FOUND, 'No adapters found.')
|
||||
return JSONResponse(
|
||||
{
|
||||
adapter_tuple[1]: {'adapter_name': k, 'adapter_type': adapter_tuple[0].peft_type.value}
|
||||
|
||||
@@ -1,10 +1,7 @@
from openllm_core.exceptions import (
Error as Error,
FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError,
ForbiddenAttributeError as ForbiddenAttributeError,
GpuNotAvailableError as GpuNotAvailableError,
Error as Error, FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError, #
ForbiddenAttributeError as ForbiddenAttributeError, GpuNotAvailableError as GpuNotAvailableError, #
OpenLLMException as OpenLLMException, ValidationError as ValidationError, #
MissingAnnotationAttributeError as MissingAnnotationAttributeError,
MissingDependencyError as MissingDependencyError,
OpenLLMException as OpenLLMException,
ValidationError as ValidationError,
)

@@ -5,11 +5,6 @@ import typing as t
from openllm_core.utils import LazyModule

_import_structure: dict[str, list[str]] = {'openai': [], 'cohere': [], 'hf': []}

if t.TYPE_CHECKING:
from . import cohere as cohere, hf as hf, openai as openai

if t.TYPE_CHECKING: from . import cohere as cohere, hf as hf, openai as openai
__lazy = LazyModule(__name__, os.path.abspath('__file__'), _import_structure)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__
__all__, __dir__, __getattr__ = __lazy.__all__, __lazy.__dir__, __lazy.__getattr__

@@ -1,36 +1,16 @@
import functools, importlib.metadata, openllm_core

__all__ = ['generate_labels', 'available_devices', 'device_count']


def generate_labels(llm):
return {
'backend': llm.__llm_backend__,
'framework': 'openllm',
'model_name': llm.config['model_name'],
'architecture': llm.config['architecture'],
'serialisation': llm._serialisation,
'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], #
'architecture': llm.config['architecture'], 'serialisation': llm._serialisation, #
**{package: importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'}},
}


def available_devices():
from ._strategies import NvidiaGpuResource

return tuple(NvidiaGpuResource.from_system())


def available_devices(): from ._strategies import NvidiaGpuResource; return tuple(NvidiaGpuResource.from_system())
@functools.lru_cache(maxsize=1)
def device_count() -> int:
return len(available_devices())


def device_count() -> int: return len(available_devices())
def __dir__():
coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')])
return sorted(__all__) + sorted(list(coreutils))


coreutils = set(dir(openllm_core.utils)) | set([it for it in openllm_core.utils._extras if not it.startswith('_')]); return sorted(__all__) + sorted(list(coreutils))
def __getattr__(it):
if hasattr(openllm_core.utils, it):
return getattr(openllm_core.utils, it)
if hasattr(openllm_core.utils, it): return getattr(openllm_core.utils, it)
raise AttributeError(f'module {__name__} has no attribute {it}')
