From fad4186dbca92049af1885fe45eb3987ce18f5b5 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sun, 12 Nov 2023 01:02:27 -0500 Subject: [PATCH] feat(server): helpers endpoints for conversation format (#613) * feat: add support for helpers conversation conversion endpoint also correct schema generation for openllm client Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: update clients to reuse `openllm-core` logics Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: add changelog Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- changelog.d/613.feature.md | 3 + mypy.ini | 2 +- openllm-client/pyproject.toml | 9 +-- openllm-client/src/openllm_client/_http.py | 29 +++---- openllm-client/src/openllm_client/_schemas.py | 75 ++++--------------- .../src/openllm_client/_typing_compat.py | 28 ++----- openllm-client/src/openllm_client/_utils.py | 13 +++- openllm-client/src/openllm_client/_utils.pyi | 49 ++++++++++++ .../src/openllm_core/utils/__init__.py | 63 ++++++++-------- openllm-python/src/openllm/_service.py | 30 ++++++++ openllm-python/src/openllm/utils/__init__.py | 2 +- 11 files changed, 160 insertions(+), 143 deletions(-) create mode 100644 changelog.d/613.feature.md create mode 100644 openllm-client/src/openllm_client/_utils.pyi diff --git a/changelog.d/613.feature.md b/changelog.d/613.feature.md new file mode 100644 index 00000000..999a39ff --- /dev/null +++ b/changelog.d/613.feature.md @@ -0,0 +1,3 @@ +OpenLLM server now provides a helpers endpoint to help easily create new prompt and other utilities in the future + +`/v1/helpers/messages` will format a list of messages into the correct chat messages given the chat model diff --git a/mypy.ini b/mypy.ini index 60315ebd..9648d989 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,4 +7,4 @@ warn_unused_configs = True ignore_missing_imports = true check_untyped_defs = true warn_unreachable = true -files = openllm-client/src/openllm_client/__init__.pyi, openllm-core/src/openllm_core/_typing_compat.py, openllm-client/src/openllm_client/_typing_compat.py, openllm-python/src/openllm/__init__.pyi, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/serialisation/__init__.pyi +files = openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-core/src/openllm_core/_typing_compat.py, openllm-client/src/openllm_client/_typing_compat.py, openllm-python/src/openllm/__init__.pyi, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/serialisation/__init__.pyi diff --git a/openllm-client/pyproject.toml b/openllm-client/pyproject.toml index fa7baf09..238dbbe1 100644 --- a/openllm-client/pyproject.toml +++ b/openllm-client/pyproject.toml @@ -13,14 +13,9 @@ authors = [ ] dynamic = ['readme', 'version'] classifiers = [ - "Development Status :: 5 - Production/Stable", - "Environment :: GPU :: NVIDIA CUDA", - "Environment :: GPU :: NVIDIA CUDA :: 12", - "Environment :: GPU :: NVIDIA CUDA :: 11.8", - "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License", "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries", "Operating System :: OS Independent", "Intended Audience :: Developers", @@ -57,7 +52,7 @@ keywords = [ "PyTorch", "Transformers", ] -dependencies = 
["orjson", "httpx", "attrs>=23.1.0", "cattrs>=23.1.0", 'distro', 'anyio'] +dependencies = ["openllm-core", "httpx", "distro", "anyio"] license = "Apache-2.0" name = "openllm-client" requires-python = ">=3.8" diff --git a/openllm-client/src/openllm_client/_http.py b/openllm-client/src/openllm_client/_http.py index 9052e7a2..83d61d88 100644 --- a/openllm-client/src/openllm_client/_http.py +++ b/openllm-client/src/openllm_client/_http.py @@ -1,5 +1,4 @@ from __future__ import annotations -import asyncio import importlib.metadata import logging import os @@ -7,7 +6,7 @@ import typing as t import attr -from ._schemas import MetadataOutput +from ._schemas import Metadata from ._schemas import Response from ._schemas import StreamingResponse from ._shim import MAX_RETRIES @@ -28,7 +27,7 @@ def _address_converter(addr: str): class HTTPClient(Client): _api_version: str = 'v1' _verify: bool = True - __metadata: MetadataOutput | None = None + __metadata: Metadata | None = None __config: dict[str, t.Any] | None = None def __repr__(self): @@ -55,9 +54,7 @@ class HTTPClient(Client): def _metadata(self): if self.__metadata is None: path = f'/{self._api_version}/metadata' - self.__metadata = self._post( - path, response_cls=MetadataOutput, json={}, options={'max_retries': self._max_retries} - ) + self.__metadata = self._post(path, response_cls=Metadata, json={}, options={'max_retries': self._max_retries}) return self.__metadata @property @@ -93,6 +90,7 @@ class HTTPClient(Client): f'/{self._api_version}/generate', response_cls=Response, json=dict(prompt=prompt, llm_config=llm_config, stop=stop, adapter_name=adapter_name), + options={'max_retries': self._max_retries}, ) def generate_stream( @@ -118,6 +116,7 @@ class HTTPClient(Client): f'/{self._api_version}/generate_stream', response_cls=Response, json=dict(prompt=prompt, llm_config=llm_config, stop=stop, adapter_name=adapter_name), + options={'max_retries': self._max_retries}, stream=True, ) @@ -126,7 +125,7 @@ class HTTPClient(Client): class AsyncHTTPClient(AsyncClient): _api_version: str = 'v1' _verify: bool = True - __metadata: MetadataOutput | None = None + __metadata: Metadata | None = None __config: dict[str, t.Any] | None = None def __repr__(self): @@ -147,20 +146,10 @@ class AsyncHTTPClient(AsyncClient): return super()._build_auth_headers() @property - def _loop(self) -> asyncio.AbstractEventLoop: - try: - return asyncio.get_running_loop() - except RuntimeError: - return asyncio.get_event_loop() - - @property - async def _metadata(self) -> t.Awaitable[MetadataOutput]: + async def _metadata(self) -> t.Awaitable[Metadata]: if self.__metadata is None: self.__metadata = await self._post( - f'/{self._api_version}/metadata', - response_cls=MetadataOutput, - json={}, - options={'max_retries': self._max_retries}, + f'/{self._api_version}/metadata', response_cls=Metadata, json={}, options={'max_retries': self._max_retries} ) return self.__metadata @@ -198,6 +187,7 @@ class AsyncHTTPClient(AsyncClient): f'/{self._api_version}/generate', response_cls=Response, json=dict(prompt=prompt, llm_config=llm_config, stop=stop, adapter_name=adapter_name), + options={'max_retries': self._max_retries}, ) async def generate_stream( @@ -228,6 +218,7 @@ class AsyncHTTPClient(AsyncClient): f'/{self._api_version}/generate_stream', response_cls=Response, json=dict(prompt=prompt, llm_config=llm_config, stop=stop, adapter_name=adapter_name), + options={'max_retries': self._max_retries}, stream=True, ): yield response_chunk diff --git 
a/openllm-client/src/openllm_client/_schemas.py b/openllm-client/src/openllm_client/_schemas.py index 235e4896..09a49d53 100644 --- a/openllm-client/src/openllm_client/_schemas.py +++ b/openllm-client/src/openllm_client/_schemas.py @@ -2,15 +2,24 @@ from __future__ import annotations import typing as t import attr -import cattr import orjson +from openllm_core._schemas import CompletionChunk as CompletionChunk +from openllm_core._schemas import GenerationOutput as Response # backward compatibility +from openllm_core._schemas import _SchemaMixin as _SchemaMixin + from ._utils import converter -# XXX: sync with openllm-core/src/openllm_core/_schemas.py +__all__ = ['Response', 'CompletionChunk', 'Metadata', 'StreamingResponse'] + + @attr.define -class MetadataOutput: +class Metadata(_SchemaMixin): + """NOTE: Metadata is a modified version of the original MetadataOutput from openllm-core. + + The configuration is now structured into a dictionary for easy of use.""" + model_id: str timeout: int model_name: str @@ -20,7 +29,7 @@ class MetadataOutput: system_message: t.Optional[str] -def _structure_metadata(data: t.Dict[str, t.Any], cls: type[MetadataOutput]) -> MetadataOutput: +def _structure_metadata(data: t.Dict[str, t.Any], cls: type[Metadata]) -> Metadata: try: configuration = orjson.loads(data['configuration']) generation_config = configuration.pop('generation_config') @@ -41,58 +50,11 @@ def _structure_metadata(data: t.Dict[str, t.Any], cls: type[MetadataOutput]) -> raise RuntimeError(f'Malformed metadata (Server-side issue): {e}') from None -converter.register_structure_hook(MetadataOutput, _structure_metadata) +converter.register_structure_hook(Metadata, _structure_metadata) @attr.define -class Request: - prompt: str - llm_config: t.Dict[str, t.Any] - stop: t.Optional[t.Union[str, t.List[str]]] = attr.field(default=None) - adapter_name: t.Optional[str] = attr.field(default=None) - - def model_dump_json(self) -> t.Dict[str, t.Any]: - return cattr.unstructure(self) - - @classmethod - def model_construct(cls, data: t.Dict[str, t.Any]) -> Request: - return cattr.structure(data, cls) - - -SampleLogprobs = t.List[t.Dict[int, float]] -PromptLogprobs = t.List[t.Optional[t.Dict[int, float]]] -FinishReason = t.Literal['length', 'stop'] - - -@attr.define -class CompletionChunk: - index: int - text: str - token_ids: t.List[int] - cumulative_logprob: float - logprobs: t.Optional[SampleLogprobs] = None - finish_reason: t.Optional[FinishReason] = None - - -@attr.define -class Response: - prompt: str - finished: bool - request_id: str - outputs: t.List[CompletionChunk] - prompt_token_ids: t.Optional[t.List[int]] = attr.field(default=None) - prompt_logprobs: t.Optional[PromptLogprobs] = attr.field(default=None) - - def model_dump_json(self) -> t.Dict[str, t.Any]: - return cattr.unstructure(self) - - @classmethod - def model_construct(cls, data: t.Dict[str, t.Any]) -> Response: - return cattr.structure(data, cls) - - -@attr.define -class StreamingResponse: +class StreamingResponse(_SchemaMixin): request_id: str index: int text: str @@ -106,10 +68,3 @@ class StreamingResponse: text=response.outputs[0].text, token_ids=response.outputs[0].token_ids[0], ) - - def model_dump_json(self) -> t.Dict[str, t.Any]: - return cattr.unstructure(self) - - @classmethod - def model_construct(cls, data: t.Dict[str, t.Any]) -> StreamingResponse: - return cattr.structure(data, cls) diff --git a/openllm-client/src/openllm_client/_typing_compat.py b/openllm-client/src/openllm_client/_typing_compat.py index ecdf6a86..ce911a0a 
100644 --- a/openllm-client/src/openllm_client/_typing_compat.py +++ b/openllm-client/src/openllm_client/_typing_compat.py @@ -1,27 +1,13 @@ -import sys - from typing import Literal +from openllm_core._typing_compat import Annotated as Annotated +from openllm_core._typing_compat import LiteralString as LiteralString +from openllm_core._typing_compat import NotRequired as NotRequired +from openllm_core._typing_compat import Required as Required +from openllm_core._typing_compat import Self as Self +from openllm_core._typing_compat import dataclass_transform as dataclass_transform +from openllm_core._typing_compat import overload as overload -if sys.version_info[:2] >= (3, 11): - from typing import LiteralString as LiteralString - from typing import NotRequired as NotRequired - from typing import Required as Required - from typing import Self as Self - from typing import dataclass_transform as dataclass_transform - from typing import overload as overload -else: - from typing_extensions import LiteralString as LiteralString - from typing_extensions import NotRequired as NotRequired - from typing_extensions import Required as Required - from typing_extensions import Self as Self - from typing_extensions import dataclass_transform as dataclass_transform - from typing_extensions import overload as overload - -if sys.version_info[:2] >= (3, 9): - from typing import Annotated as Annotated -else: - from typing_extensions import Annotated as Annotated Platform = Annotated[ LiteralString, Literal['MacOS', 'Linux', 'Windows', 'FreeBSD', 'OpenBSD', 'iOS', 'iPadOS', 'Android', 'Unknown'], str diff --git a/openllm-client/src/openllm_client/_utils.py b/openllm-client/src/openllm_client/_utils.py index d5ddbf30..b6f27d56 100644 --- a/openllm-client/src/openllm_client/_utils.py +++ b/openllm-client/src/openllm_client/_utils.py @@ -1,6 +1,11 @@ -from __future__ import annotations - -from cattr import Converter +import openllm_core -converter = Converter(omit_if_default=True) +def __dir__(): + return dir(openllm_core.utils) + + +def __getattr__(name): + if hasattr(openllm_core.utils, name): + return getattr(openllm_core.utils, name) + raise AttributeError(f'module {__name__} has no attribute {name}') diff --git a/openllm-client/src/openllm_client/_utils.pyi b/openllm-client/src/openllm_client/_utils.pyi new file mode 100644 index 00000000..142336e7 --- /dev/null +++ b/openllm-client/src/openllm_client/_utils.pyi @@ -0,0 +1,49 @@ +from openllm_core.utils import calc_dir_size as calc_dir_size +from openllm_core.utils import check_bool_env as check_bool_env +from openllm_core.utils import configure_logging as configure_logging +from openllm_core.utils import field_env_key as field_env_key +from openllm_core.utils import first_not_none as first_not_none +from openllm_core.utils import flatten_attrs as flatten_attrs +from openllm_core.utils import gen_random_uuid as gen_random_uuid +from openllm_core.utils import generate_context as generate_context +from openllm_core.utils import generate_hash_from_file as generate_hash_from_file +from openllm_core.utils import get_debug_mode as get_debug_mode +from openllm_core.utils import get_quiet_mode as get_quiet_mode +from openllm_core.utils import in_notebook as in_notebook +from openllm_core.utils import lenient_issubclass as lenient_issubclass +from openllm_core.utils import reserve_free_port as reserve_free_port +from openllm_core.utils import resolve_filepath as resolve_filepath +from openllm_core.utils import resolve_user_filepath as resolve_user_filepath +from 
openllm_core.utils import set_debug_mode as set_debug_mode +from openllm_core.utils import set_quiet_mode as set_quiet_mode +from openllm_core.utils import validate_is_path as validate_is_path +from openllm_core.utils import DEBUG as DEBUG +from openllm_core.utils import DEBUG_ENV_VAR as DEBUG_ENV_VAR +from openllm_core.utils import DEV_DEBUG_VAR as DEV_DEBUG_VAR +from openllm_core.utils import MYPY as MYPY +from openllm_core.utils import QUIET_ENV_VAR as QUIET_ENV_VAR +from openllm_core.utils import SHOW_CODEGEN as SHOW_CODEGEN +from openllm_core.utils.lazy import LazyLoader as LazyLoader +from openllm_core.utils.lazy import LazyModule as LazyModule +from openllm_core.utils.lazy import VersionInfo as VersionInfo +from openllm_core.utils import analytics as analytics +from openllm_core.utils import codegen as codegen +from openllm_core.utils import dantic as dantic +from openllm_core.utils import serde as serde +from openllm_core.utils.import_utils import ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES +from openllm_core.utils.import_utils import OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES +from openllm_core.utils.import_utils import is_autoawq_available as is_autoawq_available +from openllm_core.utils.import_utils import is_autogptq_available as is_autogptq_available +from openllm_core.utils.import_utils import is_bentoml_available as is_bentoml_available +from openllm_core.utils.import_utils import is_bitsandbytes_available as is_bitsandbytes_available +from openllm_core.utils.import_utils import is_grpc_available as is_grpc_available +from openllm_core.utils.import_utils import is_jupyter_available as is_jupyter_available +from openllm_core.utils.import_utils import is_jupytext_available as is_jupytext_available +from openllm_core.utils.import_utils import is_notebook_available as is_notebook_available +from openllm_core.utils.import_utils import is_optimum_supports_gptq as is_optimum_supports_gptq +from openllm_core.utils.import_utils import is_peft_available as is_peft_available +from openllm_core.utils.import_utils import is_torch_available as is_torch_available +from openllm_core.utils.import_utils import is_transformers_available as is_transformers_available +from openllm_core.utils.import_utils import is_vllm_available as is_vllm_available +from openllm_core.utils.representation import ReprMixin as ReprMixin +from openllm_core.utils.serde import converter as converter diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index 97af8e1e..da04f2f2 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -395,31 +395,39 @@ _extras: dict[str, t.Any] = { if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_')) } _extras['__openllm_migration__'] = {'bentoml_cattr': 'converter'} -_import_structure: dict[str, list[str]] = { - 'analytics': [], - 'codegen': [], - 'dantic': [], - 'lazy': [], - 'pkg': [], - 'representation': ['ReprMixin'], - 'serde': ['converter'], - 'import_utils': [ - 'OPTIONAL_DEPENDENCIES', - 'is_vllm_available', - 'is_torch_available', - 'is_bitsandbytes_available', - 'is_peft_available', - 'is_jupyter_available', - 'is_jupytext_available', - 'is_notebook_available', - 'is_autogptq_available', - 'is_grpc_available', - 'is_transformers_available', - 'is_optimum_supports_gptq', - 'is_autoawq_available', - 'is_bentoml_available', - ], -} +__lazy = LazyModule( + __name__, + globals()['__file__'], + { + 'analytics': [], + 
'codegen': [], + 'dantic': [], + 'lazy': [], + 'pkg': [], + 'representation': ['ReprMixin'], + 'serde': ['converter'], + 'import_utils': [ + 'OPTIONAL_DEPENDENCIES', + 'is_vllm_available', + 'is_torch_available', + 'is_bitsandbytes_available', + 'is_peft_available', + 'is_jupyter_available', + 'is_jupytext_available', + 'is_notebook_available', + 'is_autogptq_available', + 'is_grpc_available', + 'is_transformers_available', + 'is_optimum_supports_gptq', + 'is_autoawq_available', + 'is_bentoml_available', + ], + }, + extra_objects=_extras, +) +__all__ = __lazy.__all__ +__dir__ = __lazy.__dir__ +__getattr__ = __lazy.__getattr__ if t.TYPE_CHECKING: # NOTE: The following exports useful utils from bentoml @@ -443,8 +451,3 @@ if t.TYPE_CHECKING: from .import_utils import is_vllm_available as is_vllm_available from .representation import ReprMixin as ReprMixin from .serde import converter as converter - -__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects=_extras) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 37ed9bb0..50d1f19c 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -67,5 +67,35 @@ def metadata_v1(_: str) -> openllm.MetadataOutput: return _Metadata +class MessagesConverterInput(t.TypedDict): + add_generation_prompt: bool + messages: t.List[t.Dict[str, t.Any]] + + +class MessageParam(t.TypedDict): + role: t.Literal['system', 'user', 'assistant'] + content: str + + +@svc.api( + route='/v1/helpers/messages', + input=JSON.from_sample( + MessagesConverterInput( + add_generation_prompt=False, + messages=[ + MessageParam(role='system', content='You are acting as Ernest Hemmingway.'), + MessageParam(role='user', content='Hi there!'), + MessageParam(role='assistant', content='Yes?'), + ], + ) + ), + output=Text(), +) +def helpers_messages_v1(message: MessagesConverterInput) -> str: + add_generation_prompt = message['add_generation_prompt'] + messages = message['messages'] + return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False) + + # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema. openllm.mount_entrypoints(svc, llm) diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py index 2438da98..d02f594a 100644 --- a/openllm-python/src/openllm/utils/__init__.py +++ b/openllm-python/src/openllm/utils/__init__.py @@ -93,7 +93,7 @@ __all__ = ['generate_labels', 'available_devices', 'device_count'] def __dir__() -> t.Sequence[str]: - return sorted(__all__) + return sorted(__all__) + sorted(dir(openllm_core.utils)) def __getattr__(it: str) -> t.Any:
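
Usage note for the `/v1/helpers/messages` endpoint added in `openllm-python/src/openllm/_service.py`: it accepts a JSON body with `add_generation_prompt` and a list of `messages`, and returns the chat-template-formatted prompt as plain text. A minimal client-side sketch, assuming an OpenLLM server is already running locally (the address and port are illustrative, not part of this patch):

```python
# Post the sample payload from _service.py to the helpers endpoint and print
# the rendered prompt. httpx is already a dependency of openllm-client.
import httpx

payload = {
    'add_generation_prompt': False,
    'messages': [
        {'role': 'system', 'content': 'You are acting as Ernest Hemingway.'},
        {'role': 'user', 'content': 'Hi there!'},
        {'role': 'assistant', 'content': 'Yes?'},
    ],
}

resp = httpx.post('http://localhost:3000/v1/helpers/messages', json=payload)
resp.raise_for_status()
print(resp.text)  # formatted prompt string for the served chat model
```

The response is the raw string produced by the served model's chat template, so it can be passed straight back to `/v1/generate` as the `prompt`.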
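
For reference, the endpoint delegates to `llm.tokenizer.apply_chat_template(..., tokenize=False)`, so the same output can be reproduced offline with the served model's tokenizer. A sketch under the assumption that the tokenizer defines a chat template (the model id below is only an example):

```python
# Offline equivalent of helpers_messages_v1: apply the tokenizer's chat
# template without tokenizing, mirroring the call made in _service.py.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')  # example model id, not mandated by this patch
messages = [
    {'role': 'system', 'content': 'You are acting as Ernest Hemingway.'},
    {'role': 'user', 'content': 'Hi there!'},
    {'role': 'assistant', 'content': 'Yes?'},
]
print(tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False))
```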