chore(service): cleanup API (#579)

* chore(service): cleanup API

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: running tools

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: tests import

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron Pham committed 2023-11-08 02:53:08 -05:00 (committed by GitHub)
parent 7398ae0486
commit ea42108e45
8 changed files with 70 additions and 71 deletions

View File

@@ -1,7 +1,6 @@
# mypy: disable-error-code="attr-defined,no-untyped-call,type-var,operator,arg-type,no-redef,misc"
from __future__ import annotations
import copy
import enum
import logging
import os
import sys
@@ -39,7 +38,6 @@ from ._typing_compat import Required
from ._typing_compat import Self
from ._typing_compat import overload
from .exceptions import ForbiddenAttributeError
from .utils import MYPY
from .utils import LazyLoader
from .utils import ReprMixin
from .utils import codegen
@@ -48,8 +46,10 @@ from .utils import dantic
from .utils import field_env_key
from .utils import first_not_none
from .utils import lenient_issubclass
from .utils.peft import FineTuneConfig, PEFT_TASK_TYPE_TARGET_MAPPING, PeftType
from .utils.import_utils import is_vllm_available
from .utils.peft import PEFT_TASK_TYPE_TARGET_MAPPING
from .utils.peft import FineTuneConfig
from .utils.peft import PeftType
if t.TYPE_CHECKING:
  import click

View File

@@ -14,11 +14,6 @@ from .utils import gen_random_uuid
if t.TYPE_CHECKING:
  import vllm
  import openllm
  from ._typing_compat import M
  from ._typing_compat import T
@attr.frozen(slots=True)
class MetadataOutput:
  model_id: str
@@ -29,20 +24,11 @@ class MetadataOutput:
  prompt_template: str
  system_message: str
  @classmethod
  def examples(cls, llm: openllm.LLM[M, T]) -> MetadataOutput:
    return cls(model_id=llm.model_id,
               timeout=llm.config['timeout'],
               model_name=llm.config['model_name'],
               backend=llm.__llm_backend__,
               configuration=llm.config.model_dump_json().decode(),
               prompt_template='{system_message}',
               system_message='You are a helpful assistant.')
  def model_dump(self) -> dict[str, t.Any]:
    return converter.unstructure(self)
  # yapf: disable
  def model_dump(self)->dict[str, t.Any]: return converter.unstructure(self)
  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_INDENT_2).decode('utf-8')
  # yapf: enable
  def model_dump_json(self) -> str:
    return orjson.dumps(self.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')
@attr.define(slots=True, frozen=True)
class GenerationInput:
@@ -51,12 +37,15 @@ class GenerationInput:
  stop: list[str] | None = attr.field(default=None)
  adapter_name: str | None = attr.field(default=None)
  # yapf: disable
  @classmethod
  def from_model(cls,model_name:str,**attrs: t.Any)->type[GenerationInput]:return cls.from_llm_config(AutoConfig.for_model(model_name,**attrs))
  def model_dump(self)->dict[str,t.Any]:return {'prompt': self.prompt,'stop': self.stop,'llm_config': self.llm_config.model_dump(flatten=True),'adapter_name': self.adapter_name}
  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_INDENT_2).decode('utf-8')
  # yapf: enable
  def from_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
    return cls.from_llm_config(AutoConfig.for_model(model_name, **attrs))
  def model_dump(self) -> dict[str, t.Any]:
    return {'prompt': self.prompt, 'stop': self.stop, 'llm_config': self.llm_config.model_dump(flatten=True), 'adapter_name': self.adapter_name}
  def model_dump_json(self) -> str:
    return orjson.dumps(self.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')
  @classmethod
  def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]:
@@ -78,8 +67,8 @@ class GenerationInput:
                             repr=True,
                             collect_by_mro=True)
    def examples(_: type[GenerationInput]) -> GenerationInput:
      return klass(prompt='What is the meaning of life?', llm_config=llm_config, stop=['\n'])
    def examples(_: type[GenerationInput]) -> dict[str, t.Any]:
      return klass(prompt='What is the meaning of life?', llm_config=llm_config, stop=['\n']).model_dump()
    setattr(klass, 'examples', classmethod(examples))
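
The hunk above changes the dynamically attached examples classmethod to return a plain dict (via model_dump()) instead of a class instance, so callers can hand it straight to JSON.from_sample. Below is a minimal sketch of that attach-a-classmethod-at-runtime pattern, using hypothetical names (make_input_class, InputSketch) rather than OpenLLM's actual helpers:

import typing as t
import attr

def make_input_class(name: str) -> type:
  # Build a frozen attrs class at runtime, loosely mirroring from_llm_config above
  klass = attr.make_class(name, {'prompt': attr.attrib(type=str), 'stop': attr.attrib(default=None)}, frozen=True, slots=True)
  def examples(_: type) -> dict[str, t.Any]:
    # Return an unstructured dict so the sample can feed JSON.from_sample directly
    return attr.asdict(klass(prompt='What is the meaning of life?', stop=['\n']))
  setattr(klass, 'examples', classmethod(examples))
  return klass

InputSketch = make_input_class('InputSketch')
print(InputSketch.examples())  # {'prompt': 'What is the meaning of life?', 'stop': ['\n']}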
@@ -121,7 +110,7 @@ class GenerationOutput:
  request_id: str = attr.field(factory=lambda: gen_random_uuid())
  @classmethod
  def examples(cls) -> GenerationOutput:
  def examples(cls) -> dict[str, t.Any]:
    return cls(prompt='What is the meaning of life?',
               finished=True,
               outputs=[
@@ -134,7 +123,7 @@ class GenerationOutput:
               ],
               prompt_token_ids=[2, 2264, 16, 5, 3099, 9, 301, 116],
               prompt_logprobs=None,
               request_id=gen_random_uuid())
               request_id=gen_random_uuid()).model_dump()
  @staticmethod
  def _preprocess_sse_message(data: str) -> str:
@@ -164,8 +153,11 @@ class GenerationOutput:
               prompt_token_ids=request_output.prompt_token_ids,
               prompt_logprobs=request_output.prompt_logprobs)
  # yapf: disable
  def with_options(self,**options: t.Any)->GenerationOutput: return attr.evolve(self, **options)
  def model_dump(self)->dict[str, t.Any]:return converter.unstructure(self)
  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
  # yapf: enable
  def with_options(self, **options: t.Any) -> GenerationOutput:
    return attr.evolve(self, **options)
  def model_dump(self) -> dict[str, t.Any]:
    return converter.unstructure(self)
  def model_dump_json(self) -> str:
    return orjson.dumps(self.model_dump(), option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
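
Both schema classes now serialise through the same two-step pattern: cattrs unstructures the attrs instance into builtins, then orjson encodes the result. A minimal self-contained sketch of that pattern, assuming a module-level converter like the one in openllm_core._schemas (the class and fields here are illustrative):

import typing as t
import attr
import cattrs
import orjson

converter = cattrs.Converter()

@attr.define(slots=True, frozen=True)
class ChunkSketch:
  index: int
  text: str
  def model_dump(self) -> dict[str, t.Any]:
    # cattrs turns the attrs instance into plain builtins
    return converter.unstructure(self)
  def model_dump_json(self) -> str:
    # OPT_NON_STR_KEYS mirrors the GenerationOutput serialiser above
    return orjson.dumps(self.model_dump(), option=orjson.OPT_NON_STR_KEYS).decode('utf-8')

print(ChunkSketch(index=0, text='42').model_dump_json())  # {"index":0,"text":"42"}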

View File

@@ -1,6 +1,12 @@
from __future__ import annotations
import enum, typing as t, inflection, attr
import enum
import typing as t
import attr
import inflection
from deepmerge import Merger
from . import dantic
from ..exceptions import ForbiddenAttributeError
@@ -8,8 +14,9 @@ config_merger = Merger([(dict, 'merge')], ['override'], ['override'])
if t.TYPE_CHECKING:
  from peft.config import PeftConfig
  from .._typing_compat import AdapterType
  from .._configuration import LLMConfig
  from .._typing_compat import AdapterType
# case insensitive, but rename to conform with type
class _PeftEnumMeta(enum.EnumMeta):
@@ -72,7 +79,8 @@ class FineTuneConfig:
  def build(self) -> PeftConfig:
    try:
      from peft import TaskType, PEFT_TYPE_TO_CONFIG_MAPPING, get_peft_config
      from peft.utils.peft_types import TaskType
      from peft.mapping import get_peft_config
    except ImportError:
      raise ImportError('PEFT is not installed. Please install it via `pip install "openllm[fine-tune]"`.')
    adapter_config = self.adapter_config.copy()
@@ -83,7 +91,7 @@ class FineTuneConfig:
    # respect user set task_type if it is passed, otherwise use one managed by OpenLLM
    inference_mode = adapter_config.pop('inference_mode', self.inference_mode)
    task_type = adapter_config.pop('task_type', TaskType[self.llm_config_class.peft_task_type()])
    adapter_config = {'peft_type': self.adapter_type.value, "task_type": task_type, "inference_mode": inference_mode, **adapter_config}
    adapter_config = {'peft_type': self.adapter_type.value, 'task_type': task_type, 'inference_mode': inference_mode, **adapter_config}
    return get_peft_config(adapter_config)
  def train(self) -> FineTuneConfig:
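
build() ultimately hands a plain dict to peft's get_peft_config, which dispatches on the peft_type key to the matching config class. A minimal sketch of that hand-off, assuming peft is installed; the concrete values (LORA, CAUSAL_LM, r=8) are illustrative, not OpenLLM's defaults:

from peft.mapping import get_peft_config
from peft.utils.peft_types import TaskType

adapter_config = {
  'peft_type': 'LORA',              # self.adapter_type.value in the code above
  'task_type': TaskType.CAUSAL_LM,  # resolved from llm_config_class.peft_task_type() above
  'inference_mode': False,          # train()/inference() toggle this flag
  'r': 8,                           # remaining keys are adapter-specific options
}
peft_config = get_peft_config(adapter_config)
print(type(peft_config).__name__)  # LoraConfig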

View File

@@ -67,6 +67,7 @@ _import_structure: dict[str, list[str]] = {
  'protocol': [],
  'utils': [],
  '_deprecated': ['Runner'],
  '_strategies': ['CascadingResourceStrategy', 'get_resource'],
  'entrypoints': ['mount_entrypoints'],
  'serialisation': ['ggml', 'transformers'],
  'cli._sdk': ['start', 'start_grpc', 'build', 'import_model', 'list_models'],
@@ -84,6 +85,8 @@ if _t.TYPE_CHECKING:
  from . import serialisation as serialisation
  from . import testing as testing
  from . import utils as utils
  from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy
  from ._strategies import get_resource as get_resource
  from ._generation import LogitsProcessorList as LogitsProcessorList
  from ._generation import StopOnTokens as StopOnTokens
  from ._generation import StoppingCriteriaList as StoppingCriteriaList
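
The _strategies exports are registered twice on purpose: once in _import_structure so the attribute is only imported on first access, and once under TYPE_CHECKING with the import-as re-export idiom so static type checkers still see the symbols. A minimal sketch of the lazy half, loosely modelled on the LazyModule idiom this package uses (names here are illustrative):

import importlib
import types
import typing as t

_import_structure: dict[str, list[str]] = {'_strategies': ['CascadingResourceStrategy', 'get_resource']}

class LazyModuleSketch(types.ModuleType):
  def __init__(self, name: str, import_structure: dict[str, list[str]]) -> None:
    super().__init__(name)
    # map each exported symbol back to the submodule that defines it
    self._symbol_to_module = {sym: mod for mod, syms in import_structure.items() for sym in syms}
  def __getattr__(self, item: str) -> t.Any:
    if item in self._symbol_to_module:
      # import the submodule only when the symbol is first touched
      submodule = importlib.import_module(f'.{self._symbol_to_module[item]}', self.__name__)
      return getattr(submodule, item)
    raise AttributeError(f'module {self.__name__!r} has no attribute {item!r}')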

View File

@@ -19,7 +19,6 @@ from bentoml._internal.models.model import ModelSignature
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
from openllm_core._schemas import CompletionChunk
from openllm_core._schemas import GenerationOutput
from ._strategies import CascadingResourceStrategy
from openllm_core._typing_compat import AdapterMap
from openllm_core._typing_compat import AdapterTuple
from openllm_core._typing_compat import AdapterType
@@ -51,6 +50,7 @@ from openllm_core.utils import resolve_filepath
from openllm_core.utils import validate_is_path
from ._quantisation import infer_quantisation_config
from ._strategies import CascadingResourceStrategy
from .exceptions import ForbiddenAttributeError
from .exceptions import OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
import logging
import os
import typing as t
import warnings
import _service_vars as svars
import orjson
@@ -11,47 +10,43 @@ import orjson
import bentoml
import openllm
# The following warnings come from bitsandbytes and are probably not that important for users to see
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
from bentoml.io import JSON
from bentoml.io import Text
logger = logging.getLogger(__name__)
model = svars.model
model_id = svars.model_id
adapter_map = svars.adapter_map
model_tag = svars.model_tag
llm_config = openllm.AutoConfig.for_model(model)
llm = openllm.LLM[t.Any, t.Any](model_id,
llm_config = openllm.AutoConfig.for_model(svars.model)
llm = openllm.LLM[t.Any, t.Any](svars.model_id,
                                llm_config=llm_config,
                                model_tag=model_tag,
                                model_tag=svars.model_tag,
                                prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), getattr(llm_config, 'default_prompt_template', None)),
                                system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), getattr(llm_config, 'default_system_message', None)),
                                serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
                                adapter_map=orjson.loads(adapter_map))
                                adapter_map=orjson.loads(svars.adapter_map))
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[llm.runner])
llm_model_class = openllm.GenerationInput.from_llm_config(llm_config)
@svc.api(route='/v1/generate', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.JSON.from_sample(openllm.GenerationOutput.examples().model_dump()))
@svc.api(route='/v1/generate', input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
  return await llm.generate(**llm_model_class(**input_dict).model_dump())
@svc.api(route='/v1/generate_stream', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.Text(content_type='text/event-stream'))
@svc.api(route='/v1/generate_stream', input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
    yield f'data: {it.model_dump_json()}\n\n'
  yield 'data: [DONE]\n\n'
@svc.api(route='/v1/metadata', input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample(openllm.MetadataOutput.examples(llm).model_dump()))
_Metadata = openllm.MetadataOutput(timeout=llm_config['timeout'],
                                   model_name=llm_config['model_name'],
                                   backend=llm.__llm_backend__,
                                   model_id=llm.model_id,
                                   configuration=llm_config.model_dump_json().decode(),
                                   prompt_template=llm.runner.prompt_template,
                                   system_message=llm.runner.system_message)
@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
def metadata_v1(_: str) -> openllm.MetadataOutput:
  return openllm.MetadataOutput(timeout=llm_config['timeout'],
                                model_name=llm_config['model_name'],
                                backend=llm.__llm_backend__,
                                model_id=llm.model_id,
                                configuration=llm_config.model_dump_json().decode(),
                                prompt_template=llm.runner.prompt_template,
                                system_message=llm.runner.system_message)
  return _Metadata
openllm.mount_entrypoints(svc, llm)  # HACK: This must always be the last line in this file, as we do some monkey-patching of the OpenAPI schema.
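
The service now builds its sample payloads once at import time: examples() returns plain dicts and the metadata response is materialised into a module-level _Metadata, so each @svc.api decorator can call JSON.from_sample without per-request work. A minimal runner-less sketch of that wiring, assuming the bentoml 1.x io descriptors used above (the echo logic is illustrative only):

import typing as t
import bentoml
from bentoml.io import JSON, Text

svc = bentoml.Service(name='llm-sketch-service')
_sample_input = {'prompt': 'What is the meaning of life?', 'stop': ['\n']}
_sample_output = {'text': '42', 'finished': True}

@svc.api(route='/v1/generate', input=JSON.from_sample(_sample_input), output=JSON.from_sample(_sample_output))
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
  # stand-in for llm.generate(...)
  return {'text': input_dict['prompt'].upper(), 'finished': True}

@svc.api(route='/v1/generate_stream', input=JSON.from_sample(_sample_input), output=Text(content_type='text/event-stream'))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  # stand-in for llm.generate_iterator(...), streamed as server-sent events
  for token in input_dict['prompt'].split():
    yield f'data: {token}\n\n'
  yield 'data: [DONE]\n\n'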

View File

@@ -9,12 +9,13 @@ import types
import typing as t
import warnings
import psutil, bentoml
import psutil
import bentoml
from bentoml._internal.resource import get_resource
from bentoml._internal.resource import system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS
from openllm_core._typing_compat import overload
from openllm_core.utils import DEBUG
from openllm_core.utils import ReprMixin

View File

@@ -43,7 +43,7 @@ target-version = "py312"
typing-modules = ["openllm_core._typing_compat"]
unfixable = ["TCH004"]
[flake8-type-checking]
[lint.flake8-type-checking]
exempt-modules = ["typing", "typing_extensions", "openllm_core._typing_compat"]
runtime-evaluated-base-classes = [
"openllm_core._configuration.LLMConfig",
@@ -59,13 +59,13 @@ quote-style = "single"
indent-style = "space"
skip-magic-trailing-comma = true
[pydocstyle]
[lint.pydocstyle]
convention = "google"
[pycodestyle]
[lint.pycodestyle]
ignore-overlong-task-comments = true
[isort]
[lint.isort]
combine-as-imports = true
force-single-line = true
force-wrap-aliases = false
@@ -85,13 +85,13 @@ lines-between-types = 1
no-lines-before = ["future", "standard-library"]
relative-imports-order = "closest-to-furthest"
[flake8-quotes]
[lint.flake8-quotes]
avoid-escape = false
multiline-quotes = "single"
inline-quotes = "single"
docstring-quotes = "double"
[extend-per-file-ignores]
[lint.extend-per-file-ignores]
"openllm-python/src/openllm/models/**" = ["E", "F", "I001"]
"openllm-python/tests/**/*" = ["S101", "TID252", "PT011", "S307"]
"openllm-python/src/openllm/_llm.py" = ["F811"]