From fad4186dbca92049af1885fe45eb3987ce18f5b5 Mon Sep 17 00:00:00 2001 From: Aaron Pham <29749331+aarnphm@users.noreply.github.com> Date: Sun, 12 Nov 2023 01:02:27 -0500 Subject: [PATCH] feat(server): helpers endpoints for conversation format (#613) * feat: add support for helpers conversation conversion endpoint also correct schema generation for openllm client Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: update clients to reuse `openllm-core` logics Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> * chore: add changelog Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --------- Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com> --- changelog.d/613.feature.md | 3 + mypy.ini | 2 +- openllm-client/pyproject.toml | 9 +-- openllm-client/src/openllm_client/_http.py | 29 +++---- openllm-client/src/openllm_client/_schemas.py | 75 ++++--------------- .../src/openllm_client/_typing_compat.py | 28 ++----- openllm-client/src/openllm_client/_utils.py | 13 +++- openllm-client/src/openllm_client/_utils.pyi | 49 ++++++++++++ .../src/openllm_core/utils/__init__.py | 63 ++++++++-------- openllm-python/src/openllm/_service.py | 30 ++++++++ openllm-python/src/openllm/utils/__init__.py | 2 +- 11 files changed, 160 insertions(+), 143 deletions(-) create mode 100644 changelog.d/613.feature.md create mode 100644 openllm-client/src/openllm_client/_utils.pyi diff --git a/changelog.d/613.feature.md b/changelog.d/613.feature.md new file mode 100644 index 00000000..999a39ff --- /dev/null +++ b/changelog.d/613.feature.md @@ -0,0 +1,3 @@ +OpenLLM server now provides a helpers endpoint to help easily create new prompt and other utilities in the future + +`/v1/helpers/messages` will format a list of messages into the correct chat messages given the chat model diff --git a/mypy.ini b/mypy.ini index 60315ebd..9648d989 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,4 +7,4 @@ warn_unused_configs = True ignore_missing_imports = true check_untyped_defs = true warn_unreachable = true -files = openllm-client/src/openllm_client/__init__.pyi, openllm-core/src/openllm_core/_typing_compat.py, openllm-client/src/openllm_client/_typing_compat.py, openllm-python/src/openllm/__init__.pyi, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/serialisation/__init__.pyi +files = openllm-client/src/openllm_client/__init__.pyi, openllm-client/src/openllm_client/_utils.pyi, openllm-core/src/openllm_core/_typing_compat.py, openllm-client/src/openllm_client/_typing_compat.py, openllm-python/src/openllm/__init__.pyi, openllm-python/src/openllm/client.pyi, openllm-python/src/openllm/serialisation/__init__.pyi diff --git a/openllm-client/pyproject.toml b/openllm-client/pyproject.toml index fa7baf09..238dbbe1 100644 --- a/openllm-client/pyproject.toml +++ b/openllm-client/pyproject.toml @@ -13,14 +13,9 @@ authors = [ ] dynamic = ['readme', 'version'] classifiers = [ - "Development Status :: 5 - Production/Stable", - "Environment :: GPU :: NVIDIA CUDA", - "Environment :: GPU :: NVIDIA CUDA :: 12", - "Environment :: GPU :: NVIDIA CUDA :: 11.8", - "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License", "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries", "Operating System :: OS Independent", "Intended Audience :: Developers", @@ -57,7 +52,7 @@ keywords = [ "PyTorch", "Transformers", ] -dependencies = 
["orjson", "httpx", "attrs>=23.1.0", "cattrs>=23.1.0", 'distro', 'anyio'] +dependencies = ["openllm-core", "httpx", "distro", "anyio"] license = "Apache-2.0" name = "openllm-client" requires-python = ">=3.8" diff --git a/openllm-client/src/openllm_client/_http.py b/openllm-client/src/openllm_client/_http.py index 9052e7a2..83d61d88 100644 --- a/openllm-client/src/openllm_client/_http.py +++ b/openllm-client/src/openllm_client/_http.py @@ -1,5 +1,4 @@ from __future__ import annotations -import asyncio import importlib.metadata import logging import os @@ -7,7 +6,7 @@ import typing as t import attr -from ._schemas import MetadataOutput +from ._schemas import Metadata from ._schemas import Response from ._schemas import StreamingResponse from ._shim import MAX_RETRIES @@ -28,7 +27,7 @@ def _address_converter(addr: str): class HTTPClient(Client): _api_version: str = 'v1' _verify: bool = True - __metadata: MetadataOutput | None = None + __metadata: Metadata | None = None __config: dict[str, t.Any] | None = None def __repr__(self): @@ -55,9 +54,7 @@ class HTTPClient(Client): def _metadata(self): if self.__metadata is None: path = f'/{self._api_version}/metadata' - self.__metadata = self._post( - path, response_cls=MetadataOutput, json={}, options={'max_retries': self._max_retries} - ) + self.__metadata = self._post(path, response_cls=Metadata, json={}, options={'max_retries': self._max_retries}) return self.__metadata @property @@ -93,6 +90,7 @@ class HTTPClient(Client): f'/{self._api_version}/generate', response_cls=Response, json=dict(prompt=prompt, llm_config=llm_config, stop=stop, adapter_name=adapter_name), + options={'max_retries': self._max_retries}, ) def generate_stream( @@ -118,6 +116,7 @@ class HTTPClient(Client): f'/{self._api_version}/generate_stream', response_cls=Response, json=dict(prompt=prompt, llm_config=llm_config, stop=stop, adapter_name=adapter_name), + options={'max_retries': self._max_retries}, stream=True, ) @@ -126,7 +125,7 @@ class HTTPClient(Client): class AsyncHTTPClient(AsyncClient): _api_version: str = 'v1' _verify: bool = True - __metadata: MetadataOutput | None = None + __metadata: Metadata | None = None __config: dict[str, t.Any] | None = None def __repr__(self): @@ -147,20 +146,10 @@ class AsyncHTTPClient(AsyncClient): return super()._build_auth_headers() @property - def _loop(self) -> asyncio.AbstractEventLoop: - try: - return asyncio.get_running_loop() - except RuntimeError: - return asyncio.get_event_loop() - - @property - async def _metadata(self) -> t.Awaitable[MetadataOutput]: + async def _metadata(self) -> t.Awaitable[Metadata]: if self.__metadata is None: self.__metadata = await self._post( - f'/{self._api_version}/metadata', - response_cls=MetadataOutput, - json={}, - options={'max_retries': self._max_retries}, + f'/{self._api_version}/metadata', response_cls=Metadata, json={}, options={'max_retries': self._max_retries} ) return self.__metadata @@ -198,6 +187,7 @@ class AsyncHTTPClient(AsyncClient): f'/{self._api_version}/generate', response_cls=Response, json=dict(prompt=prompt, llm_config=llm_config, stop=stop, adapter_name=adapter_name), + options={'max_retries': self._max_retries}, ) async def generate_stream( @@ -228,6 +218,7 @@ class AsyncHTTPClient(AsyncClient): f'/{self._api_version}/generate_stream', response_cls=Response, json=dict(prompt=prompt, llm_config=llm_config, stop=stop, adapter_name=adapter_name), + options={'max_retries': self._max_retries}, stream=True, ): yield response_chunk diff --git 
a/openllm-client/src/openllm_client/_schemas.py b/openllm-client/src/openllm_client/_schemas.py index 235e4896..09a49d53 100644 --- a/openllm-client/src/openllm_client/_schemas.py +++ b/openllm-client/src/openllm_client/_schemas.py @@ -2,15 +2,24 @@ from __future__ import annotations import typing as t import attr -import cattr import orjson +from openllm_core._schemas import CompletionChunk as CompletionChunk +from openllm_core._schemas import GenerationOutput as Response # backward compatibility +from openllm_core._schemas import _SchemaMixin as _SchemaMixin + from ._utils import converter -# XXX: sync with openllm-core/src/openllm_core/_schemas.py +__all__ = ['Response', 'CompletionChunk', 'Metadata', 'StreamingResponse'] + + @attr.define -class MetadataOutput: +class Metadata(_SchemaMixin): + """NOTE: Metadata is a modified version of the original MetadataOutput from openllm-core. + + The configuration is now structured into a dictionary for easy of use.""" + model_id: str timeout: int model_name: str @@ -20,7 +29,7 @@ class MetadataOutput: system_message: t.Optional[str] -def _structure_metadata(data: t.Dict[str, t.Any], cls: type[MetadataOutput]) -> MetadataOutput: +def _structure_metadata(data: t.Dict[str, t.Any], cls: type[Metadata]) -> Metadata: try: configuration = orjson.loads(data['configuration']) generation_config = configuration.pop('generation_config') @@ -41,58 +50,11 @@ def _structure_metadata(data: t.Dict[str, t.Any], cls: type[MetadataOutput]) -> raise RuntimeError(f'Malformed metadata (Server-side issue): {e}') from None -converter.register_structure_hook(MetadataOutput, _structure_metadata) +converter.register_structure_hook(Metadata, _structure_metadata) @attr.define -class Request: - prompt: str - llm_config: t.Dict[str, t.Any] - stop: t.Optional[t.Union[str, t.List[str]]] = attr.field(default=None) - adapter_name: t.Optional[str] = attr.field(default=None) - - def model_dump_json(self) -> t.Dict[str, t.Any]: - return cattr.unstructure(self) - - @classmethod - def model_construct(cls, data: t.Dict[str, t.Any]) -> Request: - return cattr.structure(data, cls) - - -SampleLogprobs = t.List[t.Dict[int, float]] -PromptLogprobs = t.List[t.Optional[t.Dict[int, float]]] -FinishReason = t.Literal['length', 'stop'] - - -@attr.define -class CompletionChunk: - index: int - text: str - token_ids: t.List[int] - cumulative_logprob: float - logprobs: t.Optional[SampleLogprobs] = None - finish_reason: t.Optional[FinishReason] = None - - -@attr.define -class Response: - prompt: str - finished: bool - request_id: str - outputs: t.List[CompletionChunk] - prompt_token_ids: t.Optional[t.List[int]] = attr.field(default=None) - prompt_logprobs: t.Optional[PromptLogprobs] = attr.field(default=None) - - def model_dump_json(self) -> t.Dict[str, t.Any]: - return cattr.unstructure(self) - - @classmethod - def model_construct(cls, data: t.Dict[str, t.Any]) -> Response: - return cattr.structure(data, cls) - - -@attr.define -class StreamingResponse: +class StreamingResponse(_SchemaMixin): request_id: str index: int text: str @@ -106,10 +68,3 @@ class StreamingResponse: text=response.outputs[0].text, token_ids=response.outputs[0].token_ids[0], ) - - def model_dump_json(self) -> t.Dict[str, t.Any]: - return cattr.unstructure(self) - - @classmethod - def model_construct(cls, data: t.Dict[str, t.Any]) -> StreamingResponse: - return cattr.structure(data, cls) diff --git a/openllm-client/src/openllm_client/_typing_compat.py b/openllm-client/src/openllm_client/_typing_compat.py index ecdf6a86..ce911a0a 
100644 --- a/openllm-client/src/openllm_client/_typing_compat.py +++ b/openllm-client/src/openllm_client/_typing_compat.py @@ -1,27 +1,13 @@ -import sys - from typing import Literal +from openllm_core._typing_compat import Annotated as Annotated +from openllm_core._typing_compat import LiteralString as LiteralString +from openllm_core._typing_compat import NotRequired as NotRequired +from openllm_core._typing_compat import Required as Required +from openllm_core._typing_compat import Self as Self +from openllm_core._typing_compat import dataclass_transform as dataclass_transform +from openllm_core._typing_compat import overload as overload -if sys.version_info[:2] >= (3, 11): - from typing import LiteralString as LiteralString - from typing import NotRequired as NotRequired - from typing import Required as Required - from typing import Self as Self - from typing import dataclass_transform as dataclass_transform - from typing import overload as overload -else: - from typing_extensions import LiteralString as LiteralString - from typing_extensions import NotRequired as NotRequired - from typing_extensions import Required as Required - from typing_extensions import Self as Self - from typing_extensions import dataclass_transform as dataclass_transform - from typing_extensions import overload as overload - -if sys.version_info[:2] >= (3, 9): - from typing import Annotated as Annotated -else: - from typing_extensions import Annotated as Annotated Platform = Annotated[ LiteralString, Literal['MacOS', 'Linux', 'Windows', 'FreeBSD', 'OpenBSD', 'iOS', 'iPadOS', 'Android', 'Unknown'], str diff --git a/openllm-client/src/openllm_client/_utils.py b/openllm-client/src/openllm_client/_utils.py index d5ddbf30..b6f27d56 100644 --- a/openllm-client/src/openllm_client/_utils.py +++ b/openllm-client/src/openllm_client/_utils.py @@ -1,6 +1,11 @@ -from __future__ import annotations - -from cattr import Converter +import openllm_core -converter = Converter(omit_if_default=True) +def __dir__(): + return dir(openllm_core.utils) + + +def __getattr__(name): + if hasattr(openllm_core.utils, name): + return getattr(openllm_core.utils, name) + raise AttributeError(f'module {__name__} has no attribute {name}') diff --git a/openllm-client/src/openllm_client/_utils.pyi b/openllm-client/src/openllm_client/_utils.pyi new file mode 100644 index 00000000..142336e7 --- /dev/null +++ b/openllm-client/src/openllm_client/_utils.pyi @@ -0,0 +1,49 @@ +from openllm_core.utils import calc_dir_size as calc_dir_size +from openllm_core.utils import check_bool_env as check_bool_env +from openllm_core.utils import configure_logging as configure_logging +from openllm_core.utils import field_env_key as field_env_key +from openllm_core.utils import first_not_none as first_not_none +from openllm_core.utils import flatten_attrs as flatten_attrs +from openllm_core.utils import gen_random_uuid as gen_random_uuid +from openllm_core.utils import generate_context as generate_context +from openllm_core.utils import generate_hash_from_file as generate_hash_from_file +from openllm_core.utils import get_debug_mode as get_debug_mode +from openllm_core.utils import get_quiet_mode as get_quiet_mode +from openllm_core.utils import in_notebook as in_notebook +from openllm_core.utils import lenient_issubclass as lenient_issubclass +from openllm_core.utils import reserve_free_port as reserve_free_port +from openllm_core.utils import resolve_filepath as resolve_filepath +from openllm_core.utils import resolve_user_filepath as resolve_user_filepath +from 
openllm_core.utils import set_debug_mode as set_debug_mode +from openllm_core.utils import set_quiet_mode as set_quiet_mode +from openllm_core.utils import validate_is_path as validate_is_path +from openllm_core.utils import DEBUG as DEBUG +from openllm_core.utils import DEBUG_ENV_VAR as DEBUG_ENV_VAR +from openllm_core.utils import DEV_DEBUG_VAR as DEV_DEBUG_VAR +from openllm_core.utils import MYPY as MYPY +from openllm_core.utils import QUIET_ENV_VAR as QUIET_ENV_VAR +from openllm_core.utils import SHOW_CODEGEN as SHOW_CODEGEN +from openllm_core.utils.lazy import LazyLoader as LazyLoader +from openllm_core.utils.lazy import LazyModule as LazyModule +from openllm_core.utils.lazy import VersionInfo as VersionInfo +from openllm_core.utils import analytics as analytics +from openllm_core.utils import codegen as codegen +from openllm_core.utils import dantic as dantic +from openllm_core.utils import serde as serde +from openllm_core.utils.import_utils import ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES +from openllm_core.utils.import_utils import OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES +from openllm_core.utils.import_utils import is_autoawq_available as is_autoawq_available +from openllm_core.utils.import_utils import is_autogptq_available as is_autogptq_available +from openllm_core.utils.import_utils import is_bentoml_available as is_bentoml_available +from openllm_core.utils.import_utils import is_bitsandbytes_available as is_bitsandbytes_available +from openllm_core.utils.import_utils import is_grpc_available as is_grpc_available +from openllm_core.utils.import_utils import is_jupyter_available as is_jupyter_available +from openllm_core.utils.import_utils import is_jupytext_available as is_jupytext_available +from openllm_core.utils.import_utils import is_notebook_available as is_notebook_available +from openllm_core.utils.import_utils import is_optimum_supports_gptq as is_optimum_supports_gptq +from openllm_core.utils.import_utils import is_peft_available as is_peft_available +from openllm_core.utils.import_utils import is_torch_available as is_torch_available +from openllm_core.utils.import_utils import is_transformers_available as is_transformers_available +from openllm_core.utils.import_utils import is_vllm_available as is_vllm_available +from openllm_core.utils.representation import ReprMixin as ReprMixin +from openllm_core.utils.serde import converter as converter diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index 97af8e1e..da04f2f2 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -395,31 +395,39 @@ _extras: dict[str, t.Any] = { if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_')) } _extras['__openllm_migration__'] = {'bentoml_cattr': 'converter'} -_import_structure: dict[str, list[str]] = { - 'analytics': [], - 'codegen': [], - 'dantic': [], - 'lazy': [], - 'pkg': [], - 'representation': ['ReprMixin'], - 'serde': ['converter'], - 'import_utils': [ - 'OPTIONAL_DEPENDENCIES', - 'is_vllm_available', - 'is_torch_available', - 'is_bitsandbytes_available', - 'is_peft_available', - 'is_jupyter_available', - 'is_jupytext_available', - 'is_notebook_available', - 'is_autogptq_available', - 'is_grpc_available', - 'is_transformers_available', - 'is_optimum_supports_gptq', - 'is_autoawq_available', - 'is_bentoml_available', - ], -} +__lazy = LazyModule( + __name__, + globals()['__file__'], + { + 'analytics': [], + 
'codegen': [], + 'dantic': [], + 'lazy': [], + 'pkg': [], + 'representation': ['ReprMixin'], + 'serde': ['converter'], + 'import_utils': [ + 'OPTIONAL_DEPENDENCIES', + 'is_vllm_available', + 'is_torch_available', + 'is_bitsandbytes_available', + 'is_peft_available', + 'is_jupyter_available', + 'is_jupytext_available', + 'is_notebook_available', + 'is_autogptq_available', + 'is_grpc_available', + 'is_transformers_available', + 'is_optimum_supports_gptq', + 'is_autoawq_available', + 'is_bentoml_available', + ], + }, + extra_objects=_extras, +) +__all__ = __lazy.__all__ +__dir__ = __lazy.__dir__ +__getattr__ = __lazy.__getattr__ if t.TYPE_CHECKING: # NOTE: The following exports useful utils from bentoml @@ -443,8 +451,3 @@ if t.TYPE_CHECKING: from .import_utils import is_vllm_available as is_vllm_available from .representation import ReprMixin as ReprMixin from .serde import converter as converter - -__lazy = LazyModule(__name__, globals()['__file__'], _import_structure, extra_objects=_extras) -__all__ = __lazy.__all__ -__dir__ = __lazy.__dir__ -__getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 37ed9bb0..50d1f19c 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -67,5 +67,35 @@ def metadata_v1(_: str) -> openllm.MetadataOutput: return _Metadata +class MessagesConverterInput(t.TypedDict): + add_generation_prompt: bool + messages: t.List[t.Dict[str, t.Any]] + + +class MessageParam(t.TypedDict): + role: t.Literal['system', 'user', 'assistant'] + content: str + + +@svc.api( + route='/v1/helpers/messages', + input=JSON.from_sample( + MessagesConverterInput( + add_generation_prompt=False, + messages=[ + MessageParam(role='system', content='You are acting as Ernest Hemmingway.'), + MessageParam(role='user', content='Hi there!'), + MessageParam(role='assistant', content='Yes?'), + ], + ) + ), + output=Text(), +) +def helpers_messages_v1(message: MessagesConverterInput) -> str: + add_generation_prompt = message['add_generation_prompt'] + messages = message['messages'] + return llm.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False) + + # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema. openllm.mount_entrypoints(svc, llm) diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py index 2438da98..d02f594a 100644 --- a/openllm-python/src/openllm/utils/__init__.py +++ b/openllm-python/src/openllm/utils/__init__.py @@ -93,7 +93,7 @@ __all__ = ['generate_labels', 'available_devices', 'device_count'] def __dir__() -> t.Sequence[str]: - return sorted(__all__) + return sorted(__all__) + sorted(dir(openllm_core.utils)) def __getattr__(it: str) -> t.Any:
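
Usage note for the `/v1/helpers/messages` endpoint added in `openllm-python/src/openllm/_service.py`: it accepts a JSON body with `add_generation_prompt` and a list of `messages`, and returns the chat-template-formatted prompt as plain text. A minimal client-side sketch, assuming an OpenLLM server is already running locally (the address and port are illustrative, not part of this patch):

```python
# Post the sample payload from _service.py to the helpers endpoint and print
# the rendered prompt. httpx is already a dependency of openllm-client.
import httpx

payload = {
    'add_generation_prompt': False,
    'messages': [
        {'role': 'system', 'content': 'You are acting as Ernest Hemingway.'},
        {'role': 'user', 'content': 'Hi there!'},
        {'role': 'assistant', 'content': 'Yes?'},
    ],
}

resp = httpx.post('http://localhost:3000/v1/helpers/messages', json=payload)
resp.raise_for_status()
print(resp.text)  # formatted prompt string for the served chat model
```

The response is the raw string produced by the served model's chat template, so it can be passed straight back to `/v1/generate` as the `prompt`.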
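
For reference, the endpoint delegates to `llm.tokenizer.apply_chat_template(..., tokenize=False)`, so the same output can be reproduced offline with the served model's tokenizer. A sketch under the assumption that the tokenizer defines a chat template (the model id below is only an example):

```python
# Offline equivalent of helpers_messages_v1: apply the tokenizer's chat
# template without tokenizing, mirroring the call made in _service.py.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')  # example model id, not mandated by this patch
messages = [
    {'role': 'system', 'content': 'You are acting as Ernest Hemingway.'},
    {'role': 'user', 'content': 'Hi there!'},
    {'role': 'assistant', 'content': 'Yes?'},
]
print(tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False))
```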