mirror of https://github.com/bentoml/OpenLLM.git
synced 2026-04-25 09:32:37 -04:00

chore(service): cleanup API (#579)

* chore(service): cleanup API

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: running tools

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: tests import

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -1,7 +1,6 @@
# mypy: disable-error-code="attr-defined,no-untyped-call,type-var,operator,arg-type,no-redef,misc"
from __future__ import annotations
import copy
import enum
import logging
import os
import sys
@@ -39,7 +38,6 @@ from ._typing_compat import Required
from ._typing_compat import Self
from ._typing_compat import overload
from .exceptions import ForbiddenAttributeError
from .utils import MYPY
from .utils import LazyLoader
from .utils import ReprMixin
from .utils import codegen
@@ -48,8 +46,10 @@ from .utils import dantic
from .utils import field_env_key
from .utils import first_not_none
from .utils import lenient_issubclass
-from .utils.peft import FineTuneConfig, PEFT_TASK_TYPE_TARGET_MAPPING, PeftType
from .utils.import_utils import is_vllm_available
+from .utils.peft import PEFT_TASK_TYPE_TARGET_MAPPING
+from .utils.peft import FineTuneConfig
+from .utils.peft import PeftType

if t.TYPE_CHECKING:
  import click
@@ -14,11 +14,6 @@ from .utils import gen_random_uuid
if t.TYPE_CHECKING:
  import vllm

-  import openllm
-
-  from ._typing_compat import M
-  from ._typing_compat import T

@attr.frozen(slots=True)
class MetadataOutput:
  model_id: str
@@ -29,20 +24,11 @@ class MetadataOutput:
  prompt_template: str
  system_message: str

-  @classmethod
-  def examples(cls, llm: openllm.LLM[M, T]) -> MetadataOutput:
-    return cls(model_id=llm.model_id,
-               timeout=llm.config['timeout'],
-               model_name=llm.config['model_name'],
-               backend=llm.__llm_backend__,
-               configuration=llm.config.model_dump_json().decode(),
-               prompt_template='{system_message}',
-               system_message='You are a helpful assistant.')
-  def model_dump(self) -> dict[str, t.Any]:
-    return converter.unstructure(self)
+  # yapf: disable
+  def model_dump(self)->dict[str, t.Any]: return converter.unstructure(self)
+  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_INDENT_2).decode('utf-8')
+  # yapf: enable
-  def model_dump_json(self) -> str:
-    return orjson.dumps(self.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')
@attr.define(slots=True, frozen=True)
class GenerationInput:
@@ -51,12 +37,15 @@ class GenerationInput:
  stop: list[str] | None = attr.field(default=None)
  adapter_name: str | None = attr.field(default=None)

+  # yapf: disable
  @classmethod
+  def from_model(cls,model_name:str,**attrs: t.Any)->type[GenerationInput]:return cls.from_llm_config(AutoConfig.for_model(model_name,**attrs))
+  def model_dump(self)->dict[str,t.Any]:return {'prompt': self.prompt,'stop': self.stop,'llm_config': self.llm_config.model_dump(flatten=True),'adapter_name': self.adapter_name}
+  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_INDENT_2).decode('utf-8')
+  # yapf: enable
-  def from_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
-    return cls.from_llm_config(AutoConfig.for_model(model_name, **attrs))
-
-  def model_dump(self) -> dict[str, t.Any]:
-    return {'prompt': self.prompt, 'stop': self.stop, 'llm_config': self.llm_config.model_dump(flatten=True), 'adapter_name': self.adapter_name}
-
-  def model_dump_json(self) -> str:
-    return orjson.dumps(self.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')

  @classmethod
  def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]:
@@ -78,8 +67,8 @@ class GenerationInput:
                      repr=True,
                      collect_by_mro=True)

-    def examples(_: type[GenerationInput]) -> GenerationInput:
-      return klass(prompt='What is the meaning of life?', llm_config=llm_config, stop=['\n'])
+    def examples(_: type[GenerationInput]) -> dict[str, t.Any]:
+      return klass(prompt='What is the meaning of life?', llm_config=llm_config, stop=['\n']).model_dump()

    setattr(klass, 'examples', classmethod(examples))
@@ -121,7 +110,7 @@ class GenerationOutput:
  request_id: str = attr.field(factory=lambda: gen_random_uuid())

  @classmethod
-  def examples(cls) -> GenerationOutput:
+  def examples(cls) -> dict[str, t.Any]:
    return cls(prompt='What is the meaning of life?',
               finished=True,
               outputs=[
@@ -134,7 +123,7 @@ class GenerationOutput:
               ],
               prompt_token_ids=[2, 2264, 16, 5, 3099, 9, 301, 116],
               prompt_logprobs=None,
-               request_id=gen_random_uuid())
+               request_id=gen_random_uuid()).model_dump()

  @staticmethod
  def _preprocess_sse_message(data: str) -> str:
@@ -164,8 +153,11 @@ class GenerationOutput:
                      prompt_token_ids=request_output.prompt_token_ids,
                      prompt_logprobs=request_output.prompt_logprobs)

+  # yapf: disable
+  def with_options(self,**options: t.Any)->GenerationOutput: return attr.evolve(self, **options)
+  def model_dump(self)->dict[str, t.Any]:return converter.unstructure(self)
+  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
+  # yapf: enable
-  def with_options(self, **options: t.Any) -> GenerationOutput:
-    return attr.evolve(self, **options)
-
-  def model_dump(self) -> dict[str, t.Any]:
-    return converter.unstructure(self)
-
-  def model_dump_json(self) -> str:
-    return orjson.dumps(self.model_dump(), option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
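For reference, the model_dump/model_dump_json pattern above pairs a cattrs converter with orjson. A minimal self-contained sketch of that pattern (illustrative names only; the real shared converter lives in openllm_core):

import attr
import cattrs
import orjson

converter = cattrs.Converter()  # stand-in for openllm_core's shared converter

@attr.frozen(slots=True)
class Output:
  text: str
  token_ids: list[int]

  def model_dump(self) -> dict:
    # attrs instance -> plain dict, mirroring converter.unstructure(self) above
    return converter.unstructure(self)

  def model_dump_json(self) -> str:
    # orjson returns bytes, hence .decode('utf-8'); OPT_INDENT_2 pretty-prints
    return orjson.dumps(self.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')

print(Output(text='hello', token_ids=[1, 2, 3]).model_dump_json())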
@@ -1,6 +1,12 @@
from __future__ import annotations
-import enum, typing as t, inflection, attr
+import enum
+import typing as t

+import attr
+import inflection

from deepmerge import Merger

from . import dantic
from ..exceptions import ForbiddenAttributeError
@@ -8,8 +14,9 @@ config_merger = Merger([(dict, 'merge')], ['override'], ['override'])

if t.TYPE_CHECKING:
  from peft.config import PeftConfig
-  from .._typing_compat import AdapterType

+  from .._configuration import LLMConfig
+  from .._typing_compat import AdapterType

# case insensitive, but rename to conform with type
class _PeftEnumMeta(enum.EnumMeta):
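The comment above hints that _PeftEnumMeta makes member lookup case-insensitive. A rough sketch of that enum-metaclass pattern, with hypothetical names (the real implementation may differ in detail):

import enum

class _CaseInsensitiveEnumMeta(enum.EnumMeta):
  def __getitem__(cls, name: str) -> enum.Enum:
    # Normalise the key so Demo['lora'] and Demo['LORA'] resolve to the same member.
    return super().__getitem__(name.upper())

class PeftTypeDemo(enum.Enum, metaclass=_CaseInsensitiveEnumMeta):
  LORA = 'LORA'
  PREFIX_TUNING = 'PREFIX_TUNING'

assert PeftTypeDemo['lora'] is PeftTypeDemo.LORA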
@@ -72,7 +79,8 @@ class FineTuneConfig:

  def build(self) -> PeftConfig:
    try:
-      from peft import TaskType, PEFT_TYPE_TO_CONFIG_MAPPING, get_peft_config
+      from peft.utils.peft_types import TaskType
+      from peft.mapping import get_peft_config
    except ImportError:
      raise ImportError('PEFT is not installed. Please install it via `pip install "openllm[fine-tune]"`.')
    adapter_config = self.adapter_config.copy()
@@ -83,7 +91,7 @@ class FineTuneConfig:
    # respect user set task_type if it is passed, otherwise use one managed by OpenLLM
    inference_mode = adapter_config.pop('inference_mode', self.inference_mode)
    task_type = adapter_config.pop('task_type', TaskType[self.llm_config_class.peft_task_type()])
-    adapter_config = {'peft_type': self.adapter_type.value, "task_type": task_type, "inference_mode": inference_mode, **adapter_config}
+    adapter_config = {'peft_type': self.adapter_type.value, 'task_type': task_type, 'inference_mode': inference_mode, **adapter_config}
    return get_peft_config(adapter_config)

  def train(self) -> FineTuneConfig:
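build() above funnels a plain dict into peft's get_peft_config, which dispatches on the 'peft_type' key. A minimal standalone sketch of that call; the hyperparameter values here are arbitrary examples, not OpenLLM defaults:

from peft.mapping import get_peft_config
from peft.utils.peft_types import TaskType

# get_peft_config picks the config class from the 'peft_type' key, so this
# returns a LoraConfig with the remaining keys bound as its fields.
adapter_config = {
  'peft_type': 'LORA',
  'task_type': TaskType.CAUSAL_LM,
  'inference_mode': False,
  'r': 8,            # LoRA rank (arbitrary example value)
  'lora_alpha': 16,  # scaling factor (arbitrary example value)
}
peft_config = get_peft_config(adapter_config)
print(type(peft_config).__name__)  # LoraConfig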
@@ -67,6 +67,7 @@ _import_structure: dict[str, list[str]] = {
  'protocol': [],
  'utils': [],
  '_deprecated': ['Runner'],
  '_strategies': ['CascadingResourceStrategy', 'get_resource'],
+ 'entrypoints': ['mount_entrypoints'],
  'serialisation': ['ggml', 'transformers'],
  'cli._sdk': ['start', 'start_grpc', 'build', 'import_model', 'list_models'],
@@ -84,6 +85,8 @@ if _t.TYPE_CHECKING:
  from . import serialisation as serialisation
  from . import testing as testing
  from . import utils as utils
+ from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy
+ from ._strategies import get_resource as get_resource
  from ._generation import LogitsProcessorList as LogitsProcessorList
  from ._generation import StopOnTokens as StopOnTokens
  from ._generation import StoppingCriteriaList as StoppingCriteriaList
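_import_structure drives lazy loading: exported names resolve to their submodule on first access instead of at import time. A simplified sketch of how such a mapping is typically wired up with PEP 562's module __getattr__, meant for a package __init__.py (OpenLLM's actual LazyModule helper has more machinery than this):

import importlib
import typing as t

_import_structure: dict[str, list[str]] = {
  '_deprecated': ['Runner'],
  '_strategies': ['CascadingResourceStrategy', 'get_resource'],
  'entrypoints': ['mount_entrypoints'],
}
# Reverse map: exported attribute name -> submodule that provides it.
_attr_to_module = {name: mod for mod, names in _import_structure.items() for name in names}

def __getattr__(name: str) -> t.Any:
  # PEP 562 hook: runs only when `name` is not already a module attribute,
  # so each submodule is imported on first access rather than eagerly.
  if name in _attr_to_module:
    submodule = importlib.import_module(f'.{_attr_to_module[name]}', __name__)
    return getattr(submodule, name)
  raise AttributeError(f'module {__name__!r} has no attribute {name!r}')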
@@ -19,7 +19,6 @@ from bentoml._internal.models.model import ModelSignature
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
from openllm_core._schemas import CompletionChunk
from openllm_core._schemas import GenerationOutput
-from ._strategies import CascadingResourceStrategy
from openllm_core._typing_compat import AdapterMap
from openllm_core._typing_compat import AdapterTuple
from openllm_core._typing_compat import AdapterType
@@ -51,6 +50,7 @@ from openllm_core.utils import resolve_filepath
from openllm_core.utils import validate_is_path

from ._quantisation import infer_quantisation_config
+from ._strategies import CascadingResourceStrategy
from .exceptions import ForbiddenAttributeError
from .exceptions import OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME
@@ -3,7 +3,6 @@ from __future__ import annotations
import logging
import os
import typing as t
-import warnings

import _service_vars as svars
import orjson
@@ -11,47 +10,43 @@ import orjson
import bentoml
import openllm

-# The following warnings from bitsandbytes, and probably not that important for users to see
-warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
-warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
-warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
+from bentoml.io import JSON
+from bentoml.io import Text

logger = logging.getLogger(__name__)

-model = svars.model
-model_id = svars.model_id
-adapter_map = svars.adapter_map
-model_tag = svars.model_tag
-llm_config = openllm.AutoConfig.for_model(model)
-llm = openllm.LLM[t.Any, t.Any](model_id,
+llm_config = openllm.AutoConfig.for_model(svars.model)
+llm = openllm.LLM[t.Any, t.Any](svars.model_id,
                                llm_config=llm_config,
-                               model_tag=model_tag,
+                               model_tag=svars.model_tag,
                                prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), getattr(llm_config, 'default_prompt_template', None)),
                                system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), getattr(llm_config, 'default_system_message', None)),
                                serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
-                               adapter_map=orjson.loads(adapter_map))
+                               adapter_map=orjson.loads(svars.adapter_map))
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[llm.runner])

llm_model_class = openllm.GenerationInput.from_llm_config(llm_config)

-@svc.api(route='/v1/generate', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.JSON.from_sample(openllm.GenerationOutput.examples().model_dump()))
+@svc.api(route='/v1/generate', input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
  return await llm.generate(**llm_model_class(**input_dict).model_dump())

-@svc.api(route='/v1/generate_stream', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.Text(content_type='text/event-stream'))
+@svc.api(route='/v1/generate_stream', input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
    yield f'data: {it.model_dump_json()}\n\n'
  yield 'data: [DONE]\n\n'

-@svc.api(route='/v1/metadata', input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample(openllm.MetadataOutput.examples(llm).model_dump()))
+_Metadata = openllm.MetadataOutput(timeout=llm_config['timeout'],
+                                   model_name=llm_config['model_name'],
+                                   backend=llm.__llm_backend__,
+                                   model_id=llm.model_id,
+                                   configuration=llm_config.model_dump_json().decode(),
+                                   prompt_template=llm.runner.prompt_template,
+                                   system_message=llm.runner.system_message)

+@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
def metadata_v1(_: str) -> openllm.MetadataOutput:
-  return openllm.MetadataOutput(timeout=llm_config['timeout'],
-                                model_name=llm_config['model_name'],
-                                backend=llm.__llm_backend__,
-                                model_id=llm.model_id,
-                                configuration=llm_config.model_dump_json().decode(),
-                                prompt_template=llm.runner.prompt_template,
-                                system_message=llm.runner.system_message)
+  return _Metadata

openllm.mount_entrypoints(svc, llm)  # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
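For reference, a minimal client sketch for the two generation routes defined above, assuming the service is running locally on BentoML's default port 3000 and that httpx is installed (payload fields mirror GenerationInput.model_dump):

import httpx

payload = {'prompt': 'What is the meaning of life?', 'stop': ['\n'], 'adapter_name': None}

# One-shot generation: POST /v1/generate returns a GenerationOutput JSON body.
resp = httpx.post('http://localhost:3000/v1/generate', json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())

# Streaming: POST /v1/generate_stream yields `data: {...}` SSE lines and ends
# with `data: [DONE]`, matching generate_stream_v1 above.
with httpx.stream('POST', 'http://localhost:3000/v1/generate_stream', json=payload, timeout=None) as stream:
  for line in stream.iter_lines():
    if not line.startswith('data: '):
      continue
    chunk = line[len('data: '):]
    if chunk == '[DONE]':
      break
    print(chunk)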
@@ -9,12 +9,13 @@ import types
import typing as t
import warnings

-import psutil, bentoml
+import psutil

+import bentoml

from bentoml._internal.resource import get_resource
from bentoml._internal.resource import system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS

from openllm_core._typing_compat import overload
from openllm_core.utils import DEBUG
from openllm_core.utils import ReprMixin
ruff.toml (12 lines changed)
@@ -43,7 +43,7 @@ target-version = "py312"
typing-modules = ["openllm_core._typing_compat"]
unfixable = ["TCH004"]

-[flake8-type-checking]
+[lint.flake8-type-checking]
exempt-modules = ["typing", "typing_extensions", "openllm_core._typing_compat"]
runtime-evaluated-base-classes = [
  "openllm_core._configuration.LLMConfig",
@@ -59,13 +59,13 @@ quote-style = "single"
indent-style = "space"
skip-magic-trailing-comma = true

-[pydocstyle]
+[lint.pydocstyle]
convention = "google"

-[pycodestyle]
+[lint.pycodestyle]
ignore-overlong-task-comments = true

-[isort]
+[lint.isort]
combine-as-imports = true
force-single-line = true
force-wrap-aliases = false
@@ -85,13 +85,13 @@ lines-between-types = 1
no-lines-before = ["future", "standard-library"]
relative-imports-order = "closest-to-furthest"

-[flake8-quotes]
+[lint.flake8-quotes]
avoid-escape = false
multiline-quotes = "single"
inline-quotes = "single"
docstring-quotes = "double"

-[extend-per-file-ignores]
+[lint.extend-per-file-ignores]
"openllm-python/src/openllm/models/**" = ["E", "F", "I001"]
"openllm-python/tests/**/*" = ["S101", "TID252", "PT011", "S307"]
"openllm-python/src/openllm/_llm.py" = ["F811"]