OpenLLM/openllm-python/src/_openllm_tiny/_entrypoint.py

from __future__ import annotations

import os, traceback, io, pathlib, sys, fs, click, enum, importlib, importlib.metadata, inflection, bentoml, orjson, openllm, openllm_core as core, platform, tarfile, typing as t
from ._helpers import recommended_instance_type
from openllm_core.utils import (
  DEBUG,
  DEBUG_ENV_VAR,
  QUIET_ENV_VAR,
  OPENLLM_DEV_BUILD,
  SHOW_CODEGEN,
  check_bool_env,
  compose,
  first_not_none,
  pkg,
  gen_random_uuid,
  get_debug_mode,
  get_quiet_mode,
  normalise_model_name,
)
from openllm_core._typing_compat import LiteralQuantise, LiteralSerialisation, LiteralDtype, get_literal_args
from . import _termui as termui

if sys.version_info >= (3, 11):
  import tomllib
else:
  import tomli as tomllib

OPENLLM_FIGLET = """
 ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
 ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝.
"""
_PACKAGE_NAME = 'openllm'
_TINY_PATH = pathlib.Path(os.path.abspath(__file__)).parent
_SERVICE_FILE = _TINY_PATH / '_service.py'
_SERVICE_README = _TINY_PATH / 'service.md'
_SERVICE_VARS = """\
# fmt: off
# GENERATED BY '{__command__}'. DO NOT EDIT
import orjson,openllm_core.utils as coreutils
model_id='{__model_id__}'
revision=orjson.loads(coreutils.getenv('revision',default={__model_revision__}))
quantise=coreutils.getenv('quantize',default='{__model_quantise__}',var=['QUANTISE'])
serialisation=coreutils.getenv('serialization',default='{__model_serialization__}',var=['SERIALISATION'])
dtype=coreutils.getenv('dtype', default='{__model_dtype__}',var=['TORCH_DTYPE'])
trust_remote_code=coreutils.check_bool_env("TRUST_REMOTE_CODE",{__model_trust_remote_code__})
max_model_len=orjson.loads(coreutils.getenv('max_model_len',default=orjson.dumps({__max_model_len__})))
gpu_memory_utilization=orjson.loads(coreutils.getenv('gpu_memory_utilization',default=orjson.dumps({__gpu_memory_utilization__}),var=['GPU_MEMORY_UTILISATION']))
services_config=orjson.loads(coreutils.getenv('services_config',default={__services_config__}))
"""
HF_HUB_DISABLE_PROGRESS_BARS = 'HF_HUB_DISABLE_PROGRESS_BARS'


class ItemState(enum.Enum):
  NOT_FOUND = 'NOT_FOUND'
  ADDED = 'ADDED'
  EXISTS = 'EXISTS'
  OVERWRITE = 'OVERWRITE'


@click.group(context_settings=termui.CONTEXT_SETTINGS, name='openllm')
@click.version_option(
  None,
  '--version',
  '-v',
  package_name=_PACKAGE_NAME,
  message=f'{_PACKAGE_NAME}, %(version)s\nPython ({platform.python_implementation()}) {platform.python_version()}',
)
def cli() -> None:
  """\b
   ██████╗ ██████╗ ███████╗███╗   ██╗██╗     ██╗     ███╗   ███╗
  ██╔═══██╗██╔══██╗██╔════╝████╗  ██║██║     ██║     ████╗ ████║
  ██║   ██║██████╔╝█████╗  ██╔██╗ ██║██║     ██║     ██╔████╔██║
  ██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║██║     ██║     ██║╚██╔╝██║
  ╚██████╔╝██║     ███████╗██║ ╚████║███████╗███████╗██║ ╚═╝ ██║
   ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝╚══════╝╚══════╝╚═╝     ╚═╝.

  \b
  Self-Hosting LLMs Made Easy
  """


def optimization_decorator(fn: t.Callable[..., t.Any]):
  optimization = [
    click.option(
      '--concurrency',
      type=int,
      envvar='CONCURRENCY',
      help='See https://docs.bentoml.com/en/latest/guides/concurrency.html#concurrency for more information.',
      show_envvar=True,
      default=None,
    ),
    click.option('--timeout', type=int, default=360000, help='Timeout for the model executor in seconds'),
    click.option(
      '--dtype',
      type=str,
      envvar='DTYPE',
      default='auto',
      help="Optional dtype for casting tensors for running inference ['float16', 'float32', 'bfloat16']. Default to auto for infering dtype based on available accelerator.",
    ),
    click.option(
      '--quantise',
      '--quantize',
      'quantise',
      type=str,
      default=None,
      envvar='QUANTIZE',
      show_envvar=True,
      help="""Quantisation options for this LLM.

    The following quantisation strategies are supported:

    - ``gptq``: ``GPTQ`` [quantisation](https://arxiv.org/abs/2210.17323)

    - ``awq``: ``AWQ`` [AWQ: Activation-aware Weight Quantisation](https://arxiv.org/abs/2306.00978)

    - ``squeezellm``: ``SqueezeLLM`` [SqueezeLLM: Dense-and-Sparse Quantisation](https://arxiv.org/abs/2306.07629)

    > [!NOTE] that the model must be pre-quantised to ensure correct loading, as all aforementioned quantization scheme are post-training quantization.
    """,
    ),
    click.option(
      '--serialisation',
      '--serialization',
      'serialisation',
      type=click.Choice(get_literal_args(LiteralSerialisation)),
      default=None,
      show_default=True,
      show_envvar=True,
      envvar='SERIALIZATION',
      help="""Serialisation format for loading LLM. Make sure to check HF repository for the correct format.

    Currently the following strategies are supported:

    - ``safetensors``: This will use safetensors format, which is synonymous to ``safe_serialization=True``.

    > [!NOTE] Safetensors might not work for older models, and you can always fallback to ``legacy`` if needed.

    - ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors.
    """,
    ),
    click.option(
      '--max-model-len',
      '--max_model_len',
      'max_model_len',
      type=int,
      default=None,
      help='Maximum sequence length for the model. If not specified, we will use the default value from the model config.',
    ),
    click.option(
      '--gpu-memory-utilization',
      '--gpu_memory_utilization',
      'gpu_memory_utilization',
      default=0.9,
      help='The percentage of GPU memory to be used for the model executor',
    ),
    click.option(
      '--trust-remote-code',
      '--trust_remote_code',
      'trust_remote_code',
      type=bool,
      is_flag=True,
      default=False,
      show_envvar=True,
      envvar='TRUST_REMOTE_CODE',
      help='If model from HuggingFace requires custom code, pass this to enable remote code execution. If the model is a private model, make sure to also pass this argument such that OpenLLM can determine model architecture to load.',
    ),
  ]
  return compose(*optimization)(fn)


def shared_decorator(fn: t.Callable[..., t.Any]):
  shared = [
    click.argument(
      'model_id',
      type=click.STRING,
      metavar='[REMOTE_REPO/MODEL_ID | /path/to/local/model | bentomodel_tag]',
      required=True,
    ),
    click.option(
      '--revision',
      '--bentomodel-version',
      '--model-version',
      'model_version',
      type=click.STRING,
      default=None,
      help='Optional model revision to for this LLM. If this is a private model, specify this alongside model_id will be used as a bentomodel tag. If using in conjunction with a HF model id, this will be a specific revision, code branch, or a commit id on HF repo.',
    ),
    click.option(
      '--debug',
      '--verbose',
      type=bool,
      default=False,
      show_envvar=True,
      is_flag=True,
      envvar='DEBUG',
      help='whether to enable verbose logging (For more fine-grained control, set DEBUG to number instead of this flag.).',
    ),
  ]
  return compose(*shared)(fn)


@cli.command(name='start')
@shared_decorator
@click.option('--port', type=int, default=3000, help='Port to serve the LLM. Default to 3000.')
@optimization_decorator
def start_command(
  model_id: str,
  model_version: str | None,
  timeout: int,
  concurrency: int | None,
  port: int,
  quantise: LiteralQuantise | None,
  serialisation: LiteralSerialisation | None,
  dtype: LiteralDtype | t.Literal['auto', 'float'],
  max_model_len: int | None,
  gpu_memory_utilization: float,
  trust_remote_code: bool,
  debug: bool,
):
  """Start any LLM as a REST server.

  \b
  ```bash
  $ openllm start microsoft/Phi-3-mini-4k-instruct --trust-remote-code
  ```
  """
  from _bentoml_impl.server import serve_http
  from bentoml._internal.service.loader import load
  from bentoml._internal.log import configure_server_logging

  configure_server_logging()

  try:
    # NOTE: if given model_id is a private model (assuming this is packaged into a bentomodel), then we can use it directly
    bentomodel = bentoml.models.get(model_id.lower())
    model_id = bentomodel.path
    if not trust_remote_code:
      trust_remote_code = True
  except (ValueError, bentoml.exceptions.NotFound):
    bentomodel = None

  llm_config = core.AutoConfig.from_id(model_id, trust_remote_code=trust_remote_code)
  if serialisation is None:
    termui.warning(
      f"Serialisation format is not specified. Defaulting to '{llm_config['serialisation']}'. Your model might not work with this format. Make sure to explicitly specify the serialisation format."
    )
    serialisation = llm_config['serialisation']

  # TODO: support LoRA adapters
  os.environ.update({
    QUIET_ENV_VAR: str(openllm.utils.get_quiet_mode()),
    DEBUG_ENV_VAR: str(debug) or str(openllm.utils.get_debug_mode()),
    HF_HUB_DISABLE_PROGRESS_BARS: str(not openllm.utils.get_debug_mode()),
    'MODEL_ID': model_id,
    # handling custom revision if users specify --revision alongside with model_id
    # this should work only if bentomodel is None
    'REVISION': orjson.dumps(first_not_none(model_version, default=None)).decode(),
    'SERIALIZATION': serialisation,
    'OPENLLM_CONFIG': llm_config.model_dump_json(),
    'DTYPE': dtype,
    'TRUST_REMOTE_CODE': str(trust_remote_code),
    'GPU_MEMORY_UTILIZATION': orjson.dumps(gpu_memory_utilization).decode(),
    'SERVICES_CONFIG': orjson.dumps(
      # XXX: right now we just enable GPU by default. will revisit this if we decide to support TPU later.
      dict(
        resources={'gpu': len(openllm.utils.available_devices())},
        traffic=dict(timeout=timeout, concurrency=concurrency),
      )
    ).decode(),
  })
  if max_model_len is not None:
    os.environ['MAX_MODEL_LEN'] = orjson.dumps(max_model_len).decode()
  if quantise:
    os.environ['QUANTIZE'] = str(quantise)

  working_dir = os.path.abspath(os.path.dirname(__file__))
  if sys.path[0] != working_dir:
    sys.path.insert(0, working_dir)
  load('.', working_dir=working_dir).inject_config()
  serve_http(
    '.', working_dir=working_dir, reload=check_bool_env('RELOAD', default=False), development_mode=DEBUG, port=port
  )


def get_package_version(_package: str) -> str:
  try:
    return importlib.import_module('._version', _package).__version__
  except ModuleNotFoundError:
    return importlib.metadata.version(_package)


def build_sdist(target_path: str, package: t.Literal['openllm', 'openllm-core', 'openllm-client']):
  import tomli_w

  if not check_bool_env(OPENLLM_DEV_BUILD, default=False):
    return None

  module_location = pkg.source_locations(inflection.underscore(package))
  if not module_location:
    raise RuntimeError(
      f'Could not find the source location of {package}. Make sure to set "{OPENLLM_DEV_BUILD}=False" if you are not using development build.'
    )

  package_path = pathlib.Path(module_location)
  package_version = get_package_version(package)
  project_path = package_path.parent.parent

  if not (project_path / 'pyproject.toml').exists():
    termui.warning(
      f'Custom "{package}" is detected. For a Bento to use the same build at serving time, at your custom "{package}" build to pip packages list under your "bentofile.yaml". i.e: "packages=[\'git+https://github.com/bentoml/openllm.git@bc0be03\']"'
    )
    return
  termui.debug(
    f'"{package}" is installed in "editable" mode; building "{package}" distribution with local code base. the built tar will be included in generated bento.'
  )

  def exclude_pycache(tarinfo: tarfile.TarInfo):
    if '__pycache__' in tarinfo.name or tarinfo.name.endswith(('.pyc', '.pyo')):
      return None
    return tarinfo

  with open(project_path / 'pyproject.toml', 'rb') as f:
    pyproject_toml = tomllib.load(f)
  pyproject_toml['project']['version'] = package_version
  if 'dynamic' in pyproject_toml['project'] and 'version' in pyproject_toml['project']['dynamic']:
    pyproject_toml['project']['dynamic'].remove('version')

  pyproject_io = io.BytesIO()
  tomli_w.dump(pyproject_toml, pyproject_io)
  # make a tarball of this package-version
  base_name = f'{package}-{package_version}'
  sdist_filename = f'{base_name}.tar.gz'
  files_to_include = ['src', 'LICENSE.md', 'README.md']

  if not pathlib.Path(target_path).exists():
    pathlib.Path(target_path).mkdir(parents=True, exist_ok=True)
  with tarfile.open(pathlib.Path(target_path, sdist_filename), 'w:gz') as tar:
    for file in files_to_include:
      tar.add(project_path / file, arcname=f'{base_name}/{file}', filter=exclude_pycache)
    if package == 'openllm':
      tar.add(project_path / 'CHANGELOG.md', arcname=f'{base_name}/CHANGELOG.md', filter=exclude_pycache)
    tarinfo = tar.gettarinfo(project_path / 'pyproject.toml', arcname=f'{base_name}/pyproject.toml')
    tarinfo.size = pyproject_io.tell()
    pyproject_io.seek(0)
    tar.addfile(tarinfo, pyproject_io)
  return sdist_filename


@cli.command(name='build', context_settings={'token_normalize_func': inflection.underscore})
@shared_decorator
@click.option(
  '--bento-version',
  type=str,
  default=None,
  help='Optional bento version for this BentoLLM. Default is the the model revision.',
)
@click.option(
  '--bento-tag',
  type=str,
  default=None,
  help='Optional bento version for this BentoLLM. Default is the the model revision.',
)
@click.option('--overwrite', is_flag=True, help='Overwrite existing Bento for given LLM if it already exists.')
@optimization_decorator
@click.option(
  '-o',
  '--output',
  type=click.Choice(['tag', 'default']),
  default='default',
  show_default=True,
  help="Output log format. '-o tag' to display only bento tag.",
)
def build_command(
  model_id: str,
  model_version: str | None,
  bento_version: str | None,
  bento_tag: str | None,
  overwrite: bool,
  timeout: int,
  concurrency: int | None,
  quantise: LiteralQuantise | None,
  serialisation: LiteralSerialisation | None,
  dtype: LiteralDtype | t.Literal['auto', 'float'],
  max_model_len: int | None,
  gpu_memory_utilization: float,
  output: t.Literal['default', 'tag'],
  trust_remote_code: bool,
  debug: bool,
):
  """Package a given LLM into a BentoLLM.

  \b
  ```bash
  $ openllm build microsoft/Phi-3-mini-4k-instruct --trust-remote-code
  ```

  \b
  > [!NOTE]
  > To run a container built from this Bento with GPU support, make sure
  > to have https://github.com/NVIDIA/nvidia-container-toolkit install locally.

  \b
  > [!NOTE]
  > For private model, make sure to save it to the bentomodel store first. See https://docs.bentoml.com/en/latest/guides/model-store.html#model-store for more information
  """
  import transformers
  from bentoml._internal.log import configure_logging
  from bentoml._internal.configuration import set_quiet_mode
  from bentoml._internal.configuration.containers import BentoMLContainer
  from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions

  if output == 'tag':
    set_quiet_mode(True)
    configure_logging()

  try:
    # NOTE: if given model_id is a private model (assuming the model is packaged into a bentomodel), then we can use it directly
    bentomodel = bentoml.models.get(model_id.lower())
    model_id = bentomodel.path
    _revision = bentomodel.tag.version
    if not trust_remote_code:
      trust_remote_code = True
  except (ValueError, bentoml.exceptions.NotFound):
    bentomodel, _revision = None, None

  llm_config = core.AutoConfig.from_id(model_id, trust_remote_code=trust_remote_code)
  transformers_config = transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
  commit_hash = getattr(transformers_config, '_commit_hash', None)

  # in case that user specify the revision here,
  generated_uuid = gen_random_uuid()
  _revision = first_not_none(_revision, model_version, commit_hash, default=generated_uuid)

  model_revision = None
  if bentomodel is None and model_version is not None:
    # this is when --revision|--model-version is specified alongside with HF model-id, then we set it in the generated service_vars.py, then we let users manage this themselves
    model_revision = model_version

  if serialisation is None:
    termui.warning(
      f"Serialisation format is not specified. Defaulting to '{llm_config['serialisation']}'. Your model might not work with this format. Make sure to explicitly specify the serialisation format."
    )
    serialisation = llm_config['serialisation']

  if bento_tag is None:
    _bento_version = first_not_none(bento_version, default=_revision)
    generated_tag = bentoml.Tag.from_taglike(
      f'{normalise_model_name(model_id)}-service:{_bento_version}'.lower().strip()
    )
  else:
    generated_tag = bentoml.Tag.from_taglike(bento_tag)

  state = ItemState.NOT_FOUND
  try:
    bento = bentoml.get(generated_tag)
    if overwrite:
      bentoml.delete(generated_tag)
      state = ItemState.OVERWRITE
      raise bentoml.exceptions.NotFound(f'Rebuilding existing Bento {generated_tag}') from None
    state = ItemState.EXISTS
  except bentoml.exceptions.NotFound:
    if state != ItemState.OVERWRITE:
      state = ItemState.ADDED

    labels = {'runtime': 'vllm'}
    # XXX: right now we just enable GPU by default. will revisit this if we decide to support TPU later.
    service_config = dict(
      resources=dict(
        gpu=len(openllm.utils.available_devices()),
        gpu_type=recommended_instance_type(model_id, bentomodel, serialisation),
      ),
      traffic=dict(timeout=timeout, concurrency=concurrency),
    )
    with fs.open_fs(f'temp://{gen_random_uuid()}') as llm_fs, fs.open_fs(
      f'temp://wheels_{gen_random_uuid()}'
    ) as wheel_fs:
      termui.debug(f'Generating service vars {model_id} (dir={llm_fs.getsyspath("/")})')
      script = _SERVICE_VARS.format(
        __command__=' '.join(['openllm', *sys.argv[1:]]),
        __model_id__=model_id,
        __model_revision__=orjson.dumps(model_revision),
        __model_quantise__=quantise,
        __model_dtype__=dtype,
        __model_serialization__=serialisation,
        __model_trust_remote_code__=trust_remote_code,
        __max_model_len__=max_model_len,
        __gpu_memory_utilization__=gpu_memory_utilization,
        __services_config__=orjson.dumps(service_config),
      )
      if SHOW_CODEGEN:
        termui.info(f'\n{"=" * 27}\nGenerated _service_vars.py:\n\n{script}\n{"=" * 27}\n')
      llm_fs.writetext('_service_vars.py', script)

      with _SERVICE_README.open('r') as f:
        service_readme = f.read()
      service_readme = service_readme.format(model_id=model_id)
      with _SERVICE_FILE.open('r') as f:
        service_src = f.read()
        llm_fs.writetext(llm_config['service_name'], service_src)

      built_wheels = [build_sdist(wheel_fs.getsyspath('/'), p) for p in ('openllm-core', 'openllm-client', 'openllm')]

      bento = bentoml.Bento.create(
        version=generated_tag.version,
        build_ctx=llm_fs.getsyspath('/'),
        build_config=BentoBuildConfig(
          service=f"{llm_config['service_name']}:LLMService",
          name=generated_tag.name,
          labels=labels,
          models=[ModelSpec.from_item({'tag': str(bentomodel.tag), 'alias': bentomodel.tag.name})]
          if bentomodel is not None
          else [],
          description=service_readme,
          include=list(llm_fs.walk.files()),
          exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'],
          python=PythonOptions(
            packages=['scipy', 'bentoml[tracing]>=1.2.16', 'openllm>=0.5'],
            pip_args='--no-color --progress-bar off',
            wheels=[wheel_fs.getsyspath(f"/{i.split('/')[-1]}") for i in built_wheels]
            if all(i for i in built_wheels)
            else None,
            lock_packages=False,
          ),
          docker=DockerOptions(
            python_version='3.11',
            setup_script=str(_TINY_PATH / 'setup.sh'),
            dockerfile_template=str(_TINY_PATH / 'Dockerfile.j2'),
            system_packages=['git'],
          ),
        ),
      ).save(bento_store=BentoMLContainer.bento_store.get(), model_store=BentoMLContainer.model_store.get())
  except Exception as err:
    traceback.print_exc()
    raise click.ClickException('Exception caught while building BentoLLM:\n' + str(err)) from err

  if output == 'tag':
    termui.echo(f'__tag__:{bento.tag}')
    return

  if not get_quiet_mode():
    if state != ItemState.EXISTS:
      termui.info(f"Successfully built Bento '{bento.tag}'.\n")
    elif not overwrite:
      termui.warning(f"Bento for '{model_id}' already exists [{bento}]. To overwrite it pass '--overwrite'.\n")
    if not (debug or get_debug_mode()):
      termui.echo(OPENLLM_FIGLET)
      termui.echo('📖 Next steps:\n', nl=False)
      termui.echo(f'☁️  Deploy to BentoCloud:\n  $ bentoml deploy {bento.tag} -n ${{DEPLOYMENT_NAME}}\n', nl=False)
      termui.echo(
        f'☁️  Update existing deployment on BentoCloud:\n  $ bentoml deployment update --bento {bento.tag} ${{DEPLOYMENT_NAME}}\n',
        nl=False,
      )
      termui.echo(f'🐳 Containerize BentoLLM:\n  $ bentoml containerize {bento.tag} --opt progress=plain\n', nl=False)

  return bento


if __name__ == '__main__':
  cli()