OpenLLM/tools/update-config-stubs.py

#!/usr/bin/env python3
from __future__ import annotations
import os, sys
from pathlib import Path

# currently we are assuming the indentatio level is 2 for comments
START_COMMENT = f'# {os.path.basename(__file__)}: start\n'
END_COMMENT = f'# {os.path.basename(__file__)}: stop\n'
START_SPECIAL_COMMENT = f'# {os.path.basename(__file__)}: special start\n'
END_SPECIAL_COMMENT = f'# {os.path.basename(__file__)}: special stop\n'
START_ATTRS_COMMENT = f'# {os.path.basename(__file__)}: attrs start\n'
END_ATTRS_COMMENT = f'# {os.path.basename(__file__)}: attrs stop\n'

ROOT = Path(__file__).parent.parent
_TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.py'

sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__())
from openllm_core._configuration import GenerationConfig, ModelSettings, PeftType, SamplingParams
from openllm_core.utils import codegen

def process_annotations(annotations: str) -> str:
  if 'NotRequired' in annotations: return annotations[len('NotRequired['):-1]
  elif 'Required' in annotations: return annotations[len('Required['):-1]
  else: return annotations

_value_docstring = {
    'default_id': '''Return the default model to use when using 'openllm start <model_id>'.
        This could be one of the keys in 'self.model_ids' or custom users model.

        This field is required when defining under '__config__'.
        ''',
    'model_ids': '''A list of supported pretrained models tag for this given runnable.

        For example:
            For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base",
                                            "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"]

        This field is required when defining under '__config__'.
        ''',
    'architecture': '''The model architecture that is supported by this LLM.

        Note that any model weights within this architecture generation can always be run and supported by this LLM.

        For example:
            For GPT-NeoX implementation, it is based on GptNeoXForCausalLM, which supports dolly-v2, stablelm:

            ```bash
            openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b
            ```''',
    'default_backend': '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')''',
    'url': 'The resolved url for this LLMConfig.',
    'requires_gpu': 'Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.',
    'trust_remote_code': 'Whether to always trust remote code',
    'service_name': "Generated service name for this LLMConfig. By default, it is \"generated_{model_name}_service.py\"",
    'requirements': 'The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.',
    'model_type': 'The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"',
    'name_type': '''The default name typed for this model. "dasherize" will convert the name to lowercase and
        replace spaces with dashes. "lowercase" will convert the name to lowercase. If this is not set, then both
        `model_name` and `start_name` must be specified.''',
    'model_name': 'The normalized version of __openllm_start_name__, determined by __openllm_name_type__',
    'start_name': 'Default name to be used with `openllm start`',
    'env': 'A EnvVarMixin instance for this LLMConfig.',
    'timeout': 'The default timeout to be set for this given LLM.',
    'workers_per_resource': '''The number of workers per resource. This is used to determine the number of workers to use for this model.
        For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then
        OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource.

        See StarCoder for more advanced usage. See
        https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy for more details.

        By default, it is set to 1.
        ''',
    'fine_tune_strategies': 'The fine-tune strategies for this given LLM.',
    'tokenizer_class': 'Optional tokenizer class for this given LLM. See Llama for example.',
}

_transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'}

def main() -> int:
  with _TARGET_FILE.open('r') as f:
    processed = f.readlines()

  start_idx, end_idx = processed.index(' ' * 2 + START_COMMENT), processed.index(' ' * 2 + END_COMMENT)
  start_stub_idx, end_stub_idx = processed.index(' ' * 4 + START_SPECIAL_COMMENT), processed.index(' ' * 4 + END_SPECIAL_COMMENT)
  start_attrs_idx, end_attrs_idx = processed.index(' ' * 4 + START_ATTRS_COMMENT), processed.index(' ' * 4 + END_ATTRS_COMMENT)

  # NOTE: inline stubs __config__ attrs representation
  special_attrs_lines: list[str] = []
  for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
    special_attrs_lines.append(f"{' ' * 4}{keys}: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}\n")
  # NOTE: inline stubs for _ConfigAttr type stubs
  config_attr_lines: list[str] = []
  for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
    config_attr_lines.extend(
        [' ' * 4 + line for line in [f'__openllm_{keys}__: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))} = Field(None)\n', f"'''{_value_docstring[keys]}'''\n",]])
  # NOTE: inline runtime __getitem__ overload process
  lines: list[str] = []
  lines.append(' ' * 2 + '# NOTE: ModelSettings arguments\n')
  for keys, ForwardRef in codegen.get_annotations(ModelSettings).items():
    lines.extend(
        [' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}: ...\n",]])
  # special case variables: generation_class, extras, sampling_class
  lines.append(' ' * 2 + '# NOTE: generation_class, sampling_class and extras arguments\n')
  lines.extend([
      ' ' * 2 + line for line in [
          '@overload\n',
          "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n",
          '@overload\n',
          "def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n",
          '@overload\n',
          "def __getitem__(self, item: t.Literal['extras']) -> t.Dict[str, t.Any]: ...\n",
      ]
  ])
  lines.append(' ' * 2 + '# NOTE: GenerationConfig arguments\n')
  generation_config_anns = codegen.get_annotations(GenerationConfig)
  for keys, type_pep563 in generation_config_anns.items():
    lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n"]])
  lines.append(' ' * 2 + '# NOTE: SamplingParams arguments\n')
  for keys, type_pep563 in codegen.get_annotations(SamplingParams).items():
    if keys not in generation_config_anns:
      lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n",]])
  lines.append(' ' * 2 + '# NOTE: PeftType arguments\n')
  for keys in PeftType._member_names_:
    lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys.lower()}']) -> dict[str, t.Any]: ...\n",]])

  processed = processed[:start_attrs_idx] + [' ' * 4 + START_ATTRS_COMMENT, *special_attrs_lines, ' ' * 4 + END_ATTRS_COMMENT] + processed[end_attrs_idx + 1:start_stub_idx] + [
      ' ' * 4 + START_SPECIAL_COMMENT, *config_attr_lines, ' ' * 4 + END_SPECIAL_COMMENT
  ] + processed[end_stub_idx + 1:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT] + processed[end_idx + 1:]
  with _TARGET_FILE.open('w') as f:
    f.writelines(processed)
  return 0

if __name__ == '__main__': raise SystemExit(main())