chore(service): cleanup API (#579)

* chore(service): cleanup API

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: running tools

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix: tests import

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Aaron Pham committed 2023-11-08 02:53:08 -05:00 (committed by GitHub)
parent 7398ae0486
commit ea42108e45
8 changed files with 70 additions and 71 deletions

View File

@@ -1,7 +1,6 @@
# mypy: disable-error-code="attr-defined,no-untyped-call,type-var,operator,arg-type,no-redef,misc"
from __future__ import annotations
import copy
import enum
import logging
import os
import sys
@@ -39,7 +38,6 @@ from ._typing_compat import Required
from ._typing_compat import Self
from ._typing_compat import overload
from .exceptions import ForbiddenAttributeError
from .utils import MYPY
from .utils import LazyLoader
from .utils import ReprMixin
from .utils import codegen
@@ -48,8 +46,10 @@ from .utils import dantic
from .utils import field_env_key
from .utils import first_not_none
from .utils import lenient_issubclass
from .utils.peft import FineTuneConfig, PEFT_TASK_TYPE_TARGET_MAPPING, PeftType
from .utils.import_utils import is_vllm_available
from .utils.peft import PEFT_TASK_TYPE_TARGET_MAPPING
from .utils.peft import FineTuneConfig
from .utils.peft import PeftType
if t.TYPE_CHECKING:
  import click

View File

@@ -14,11 +14,6 @@ from .utils import gen_random_uuid
if t.TYPE_CHECKING:
  import vllm
  import openllm
  from ._typing_compat import M
  from ._typing_compat import T
@attr.frozen(slots=True)
class MetadataOutput:
  model_id: str
@@ -29,20 +24,11 @@ class MetadataOutput:
  prompt_template: str
  system_message: str
  @classmethod
  def examples(cls, llm: openllm.LLM[M, T]) -> MetadataOutput:
    return cls(model_id=llm.model_id,
               timeout=llm.config['timeout'],
               model_name=llm.config['model_name'],
               backend=llm.__llm_backend__,
               configuration=llm.config.model_dump_json().decode(),
               prompt_template='{system_message}',
               system_message='You are a helpful assistant.')
  def model_dump(self) -> dict[str, t.Any]:
    return converter.unstructure(self)
  # yapf: disable
  def model_dump(self)->dict[str, t.Any]: return converter.unstructure(self)
  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_INDENT_2).decode('utf-8')
  # yapf: enable
  def model_dump_json(self) -> str:
    return orjson.dumps(self.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')
@attr.define(slots=True, frozen=True)
class GenerationInput:
@@ -51,12 +37,15 @@ class GenerationInput:
  stop: list[str] | None = attr.field(default=None)
  adapter_name: str | None = attr.field(default=None)
  # yapf: disable
  @classmethod
  def from_model(cls,model_name:str,**attrs: t.Any)->type[GenerationInput]:return cls.from_llm_config(AutoConfig.for_model(model_name,**attrs))
  def model_dump(self)->dict[str,t.Any]:return {'prompt': self.prompt,'stop': self.stop,'llm_config': self.llm_config.model_dump(flatten=True),'adapter_name': self.adapter_name}
  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_INDENT_2).decode('utf-8')
  # yapf: enable
  def from_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
    return cls.from_llm_config(AutoConfig.for_model(model_name, **attrs))
  def model_dump(self) -> dict[str, t.Any]:
    return {'prompt': self.prompt, 'stop': self.stop, 'llm_config': self.llm_config.model_dump(flatten=True), 'adapter_name': self.adapter_name}
  def model_dump_json(self) -> str:
    return orjson.dumps(self.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')
  @classmethod
  def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]:
@@ -78,8 +67,8 @@ class GenerationInput:
                             repr=True,
                             collect_by_mro=True)
    def examples(_: type[GenerationInput]) -> GenerationInput:
      return klass(prompt='What is the meaning of life?', llm_config=llm_config, stop=['\n'])
    def examples(_: type[GenerationInput]) -> dict[str, t.Any]:
      return klass(prompt='What is the meaning of life?', llm_config=llm_config, stop=['\n']).model_dump()
    setattr(klass, 'examples', classmethod(examples))
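
The hunk above changes the dynamically attached examples classmethod to return a plain dict (via model_dump()) instead of a class instance, so callers can hand it straight to JSON.from_sample. Below is a minimal sketch of that attach-a-classmethod-at-runtime pattern, using hypothetical names (make_input_class, InputSketch) rather than OpenLLM's actual helpers:

import typing as t
import attr

def make_input_class(name: str) -> type:
  # Build a frozen attrs class at runtime, loosely mirroring from_llm_config above
  klass = attr.make_class(name, {'prompt': attr.attrib(type=str), 'stop': attr.attrib(default=None)}, frozen=True, slots=True)
  def examples(_: type) -> dict[str, t.Any]:
    # Return an unstructured dict so the sample can feed JSON.from_sample directly
    return attr.asdict(klass(prompt='What is the meaning of life?', stop=['\n']))
  setattr(klass, 'examples', classmethod(examples))
  return klass

InputSketch = make_input_class('InputSketch')
print(InputSketch.examples())  # {'prompt': 'What is the meaning of life?', 'stop': ['\n']}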
@@ -121,7 +110,7 @@ class GenerationOutput:
  request_id: str = attr.field(factory=lambda: gen_random_uuid())
  @classmethod
  def examples(cls) -> GenerationOutput:
  def examples(cls) -> dict[str, t.Any]:
    return cls(prompt='What is the meaning of life?',
               finished=True,
               outputs=[
@@ -134,7 +123,7 @@ class GenerationOutput:
               ],
               prompt_token_ids=[2, 2264, 16, 5, 3099, 9, 301, 116],
               prompt_logprobs=None,
               request_id=gen_random_uuid())
               request_id=gen_random_uuid()).model_dump()
  @staticmethod
  def _preprocess_sse_message(data: str) -> str:
@@ -164,8 +153,11 @@ class GenerationOutput:
               prompt_token_ids=request_output.prompt_token_ids,
               prompt_logprobs=request_output.prompt_logprobs)
  # yapf: disable
  def with_options(self,**options: t.Any)->GenerationOutput: return attr.evolve(self, **options)
  def model_dump(self)->dict[str, t.Any]:return converter.unstructure(self)
  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
  # yapf: enable
  def with_options(self, **options: t.Any) -> GenerationOutput:
    return attr.evolve(self, **options)
  def model_dump(self) -> dict[str, t.Any]:
    return converter.unstructure(self)
  def model_dump_json(self) -> str:
    return orjson.dumps(self.model_dump(), option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
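
Both schema classes now serialise through the same two-step pattern: cattrs unstructures the attrs instance into builtins, then orjson encodes the result. A minimal self-contained sketch of that pattern, assuming a module-level converter like the one in openllm_core._schemas (the class and fields here are illustrative):

import typing as t
import attr
import cattrs
import orjson

converter = cattrs.Converter()

@attr.define(slots=True, frozen=True)
class ChunkSketch:
  index: int
  text: str
  def model_dump(self) -> dict[str, t.Any]:
    # cattrs turns the attrs instance into plain builtins
    return converter.unstructure(self)
  def model_dump_json(self) -> str:
    # OPT_NON_STR_KEYS mirrors the GenerationOutput serialiser above
    return orjson.dumps(self.model_dump(), option=orjson.OPT_NON_STR_KEYS).decode('utf-8')

print(ChunkSketch(index=0, text='42').model_dump_json())  # {"index":0,"text":"42"}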

View File

@@ -1,6 +1,12 @@
from __future__ import annotations
import enum, typing as t, inflection, attr
import enum
import typing as t
import attr
import inflection
from deepmerge import Merger
from . import dantic
from ..exceptions import ForbiddenAttributeError
@@ -8,8 +14,9 @@ config_merger = Merger([(dict, 'merge')], ['override'], ['override'])
if t.TYPE_CHECKING:
  from peft.config import PeftConfig
  from .._typing_compat import AdapterType
  from .._configuration import LLMConfig
  from .._typing_compat import AdapterType
# case insensitive, but rename to conform with type
class _PeftEnumMeta(enum.EnumMeta):
@@ -72,7 +79,8 @@ class FineTuneConfig:
  def build(self) -> PeftConfig:
    try:
      from peft import TaskType, PEFT_TYPE_TO_CONFIG_MAPPING, get_peft_config
      from peft.utils.peft_types import TaskType
      from peft.mapping import get_peft_config
    except ImportError:
      raise ImportError('PEFT is not installed. Please install it via `pip install "openllm[fine-tune]"`.')
    adapter_config = self.adapter_config.copy()
@@ -83,7 +91,7 @@ class FineTuneConfig:
    # respect user set task_type if it is passed, otherwise use one managed by OpenLLM
    inference_mode = adapter_config.pop('inference_mode', self.inference_mode)
    task_type = adapter_config.pop('task_type', TaskType[self.llm_config_class.peft_task_type()])
    adapter_config = {'peft_type': self.adapter_type.value, "task_type": task_type, "inference_mode": inference_mode, **adapter_config}
    adapter_config = {'peft_type': self.adapter_type.value, 'task_type': task_type, 'inference_mode': inference_mode, **adapter_config}
    return get_peft_config(adapter_config)
  def train(self) -> FineTuneConfig:
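
build() ultimately hands a plain dict to peft's get_peft_config, which dispatches on the peft_type key to the matching config class. A minimal sketch of that hand-off, assuming peft is installed; the concrete values (LORA, CAUSAL_LM, r=8) are illustrative, not OpenLLM's defaults:

from peft.mapping import get_peft_config
from peft.utils.peft_types import TaskType

adapter_config = {
  'peft_type': 'LORA',              # self.adapter_type.value in the code above
  'task_type': TaskType.CAUSAL_LM,  # resolved from llm_config_class.peft_task_type() above
  'inference_mode': False,          # train()/inference() toggle this flag
  'r': 8,                           # remaining keys are adapter-specific options
}
peft_config = get_peft_config(adapter_config)
print(type(peft_config).__name__)  # LoraConfig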

View File

@@ -67,6 +67,7 @@ _import_structure: dict[str, list[str]] = {
  'protocol': [],
  'utils': [],
  '_deprecated': ['Runner'],
  '_strategies': ['CascadingResourceStrategy', 'get_resource'],
  'entrypoints': ['mount_entrypoints'],
  'serialisation': ['ggml', 'transformers'],
  'cli._sdk': ['start', 'start_grpc', 'build', 'import_model', 'list_models'],
@@ -84,6 +85,8 @@ if _t.TYPE_CHECKING:
  from . import serialisation as serialisation
  from . import testing as testing
  from . import utils as utils
  from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy
  from ._strategies import get_resource as get_resource
  from ._generation import LogitsProcessorList as LogitsProcessorList
  from ._generation import StopOnTokens as StopOnTokens
  from ._generation import StoppingCriteriaList as StoppingCriteriaList
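
The _strategies exports are registered twice on purpose: once in _import_structure so the attribute is only imported on first access, and once under TYPE_CHECKING with the import-as re-export idiom so static type checkers still see the symbols. A minimal sketch of the lazy half, loosely modelled on the LazyModule idiom this package uses (names here are illustrative):

import importlib
import types
import typing as t

_import_structure: dict[str, list[str]] = {'_strategies': ['CascadingResourceStrategy', 'get_resource']}

class LazyModuleSketch(types.ModuleType):
  def __init__(self, name: str, import_structure: dict[str, list[str]]) -> None:
    super().__init__(name)
    # map each exported symbol back to the submodule that defines it
    self._symbol_to_module = {sym: mod for mod, syms in import_structure.items() for sym in syms}
  def __getattr__(self, item: str) -> t.Any:
    if item in self._symbol_to_module:
      # import the submodule only when the symbol is first touched
      submodule = importlib.import_module(f'.{self._symbol_to_module[item]}', self.__name__)
      return getattr(submodule, item)
    raise AttributeError(f'module {self.__name__!r} has no attribute {item!r}')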

View File

@@ -19,7 +19,6 @@ from bentoml._internal.models.model import ModelSignature
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
from openllm_core._schemas import CompletionChunk
from openllm_core._schemas import GenerationOutput
from ._strategies import CascadingResourceStrategy
from openllm_core._typing_compat import AdapterMap
from openllm_core._typing_compat import AdapterTuple
from openllm_core._typing_compat import AdapterType
@@ -51,6 +50,7 @@ from openllm_core.utils import resolve_filepath
from openllm_core.utils import validate_is_path
from ._quantisation import infer_quantisation_config
from ._strategies import CascadingResourceStrategy
from .exceptions import ForbiddenAttributeError
from .exceptions import OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
import logging
import os
import typing as t
import warnings
import _service_vars as svars
import orjson
@@ -11,47 +10,43 @@ import orjson
import bentoml
import openllm
# The following warnings come from bitsandbytes and are probably not that important for users to see
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
from bentoml.io import JSON
from bentoml.io import Text
logger = logging.getLogger(__name__)
model = svars.model
model_id = svars.model_id
adapter_map = svars.adapter_map
model_tag = svars.model_tag
llm_config = openllm.AutoConfig.for_model(model)
llm = openllm.LLM[t.Any, t.Any](model_id,
llm_config = openllm.AutoConfig.for_model(svars.model)
llm = openllm.LLM[t.Any, t.Any](svars.model_id,
                                llm_config=llm_config,
                                model_tag=model_tag,
                                model_tag=svars.model_tag,
                                prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), getattr(llm_config, 'default_prompt_template', None)),
                                system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), getattr(llm_config, 'default_system_message', None)),
                                serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
                                adapter_map=orjson.loads(adapter_map))
                                adapter_map=orjson.loads(svars.adapter_map))
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[llm.runner])
llm_model_class = openllm.GenerationInput.from_llm_config(llm_config)
@svc.api(route='/v1/generate', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.JSON.from_sample(openllm.GenerationOutput.examples().model_dump()))
@svc.api(route='/v1/generate', input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
  return await llm.generate(**llm_model_class(**input_dict).model_dump())
@svc.api(route='/v1/generate_stream', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.Text(content_type='text/event-stream'))
@svc.api(route='/v1/generate_stream', input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
    yield f'data: {it.model_dump_json()}\n\n'
  yield 'data: [DONE]\n\n'
@svc.api(route='/v1/metadata', input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample(openllm.MetadataOutput.examples(llm).model_dump()))
_Metadata = openllm.MetadataOutput(timeout=llm_config['timeout'],
                                   model_name=llm_config['model_name'],
                                   backend=llm.__llm_backend__,
                                   model_id=llm.model_id,
                                   configuration=llm_config.model_dump_json().decode(),
                                   prompt_template=llm.runner.prompt_template,
                                   system_message=llm.runner.system_message)
@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
def metadata_v1(_: str) -> openllm.MetadataOutput:
  return openllm.MetadataOutput(timeout=llm_config['timeout'],
                                model_name=llm_config['model_name'],
                                backend=llm.__llm_backend__,
                                model_id=llm.model_id,
                                configuration=llm_config.model_dump_json().decode(),
                                prompt_template=llm.runner.prompt_template,
                                system_message=llm.runner.system_message)
  return _Metadata
openllm.mount_entrypoints(svc, llm)  # HACK: This must always be the last line in this file, as we do some monkey-patching of the OpenAPI schema.
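
The service now builds its sample payloads once at import time: examples() returns plain dicts and the metadata response is materialised into a module-level _Metadata, so each @svc.api decorator can call JSON.from_sample without per-request work. A minimal runner-less sketch of that wiring, assuming the bentoml 1.x io descriptors used above (the echo logic is illustrative only):

import typing as t
import bentoml
from bentoml.io import JSON, Text

svc = bentoml.Service(name='llm-sketch-service')
_sample_input = {'prompt': 'What is the meaning of life?', 'stop': ['\n']}
_sample_output = {'text': '42', 'finished': True}

@svc.api(route='/v1/generate', input=JSON.from_sample(_sample_input), output=JSON.from_sample(_sample_output))
async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
  # stand-in for llm.generate(...)
  return {'text': input_dict['prompt'].upper(), 'finished': True}

@svc.api(route='/v1/generate_stream', input=JSON.from_sample(_sample_input), output=Text(content_type='text/event-stream'))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
  # stand-in for llm.generate_iterator(...), streamed as server-sent events
  for token in input_dict['prompt'].split():
    yield f'data: {token}\n\n'
  yield 'data: [DONE]\n\n'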

View File

@@ -9,12 +9,13 @@ import types
import typing as t
import warnings
import psutil, bentoml
import psutil
import bentoml
from bentoml._internal.resource import get_resource
from bentoml._internal.resource import system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS
from openllm_core._typing_compat import overload
from openllm_core.utils import DEBUG
from openllm_core.utils import ReprMixin

View File

@@ -43,7 +43,7 @@ target-version = "py312"
typing-modules = ["openllm_core._typing_compat"]
unfixable = ["TCH004"]
[flake8-type-checking]
[lint.flake8-type-checking]
exempt-modules = ["typing", "typing_extensions", "openllm_core._typing_compat"]
runtime-evaluated-base-classes = [
"openllm_core._configuration.LLMConfig",
@@ -59,13 +59,13 @@ quote-style = "single"
indent-style = "space"
skip-magic-trailing-comma = true
[pydocstyle]
[lint.pydocstyle]
convention = "google"
[pycodestyle]
[lint.pycodestyle]
ignore-overlong-task-comments = true
[isort]
[lint.isort]
combine-as-imports = true
force-single-line = true
force-wrap-aliases = false
@@ -85,13 +85,13 @@ lines-between-types = 1
no-lines-before = ["future", "standard-library"]
relative-imports-order = "closest-to-furthest"
[flake8-quotes]
[lint.flake8-quotes]
avoid-escape = false
multiline-quotes = "single"
inline-quotes = "single"
docstring-quotes = "double"
[extend-per-file-ignores]
[lint.extend-per-file-ignores]
"openllm-python/src/openllm/models/**" = ["E", "F", "I001"]
"openllm-python/tests/**/*" = ["S101", "TID252", "PT011", "S307"]
"openllm-python/src/openllm/_llm.py" = ["F811"]