Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-01-20 13:29:35 -05:00)
chore(service): cleanup API (#579)
* chore(service): cleanup API
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
* chore: running tools
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
* fix: tests import
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---------
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -67,6 +67,7 @@ _import_structure: dict[str, list[str]] = {
'protocol': [],
'utils': [],
'_deprecated': ['Runner'],
'_strategies': ['CascadingResourceStrategy', 'get_resource'],
'entrypoints': ['mount_entrypoints'],
'serialisation': ['ggml', 'transformers'],
'cli._sdk': ['start', 'start_grpc', 'build', 'import_model', 'list_models'],
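The `_import_structure` table drives lazy imports for the top-level `openllm` package: submodules and their exported names are only imported on first attribute access. Below is a minimal sketch of that pattern (PEP 562 module `__getattr__`); OpenLLM's actual `__init__.py` uses its own LazyModule-style helper, so treat this as an illustration, not the real implementation.

# Minimal sketch of the lazy-import pattern behind _import_structure; this is
# NOT OpenLLM's actual helper, just the general PEP 562 approach it follows.
import importlib
import typing as t

_import_structure: dict[str, list[str]] = {
  '_strategies': ['CascadingResourceStrategy', 'get_resource'],
  'entrypoints': ['mount_entrypoints'],
}

# Reverse map: exported symbol -> submodule that defines it.
_symbol_to_module = {sym: mod for mod, syms in _import_structure.items() for sym in syms}

def __getattr__(name: str) -> t.Any:
  if name in _import_structure:  # e.g. `openllm.entrypoints`
    return importlib.import_module(f'.{name}', __name__)
  if name in _symbol_to_module:  # e.g. `openllm.mount_entrypoints`
    module = importlib.import_module(f'.{_symbol_to_module[name]}', __name__)
    return getattr(module, name)
  raise AttributeError(f'module {__name__!r} has no attribute {name!r}')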
@@ -84,6 +85,8 @@ if _t.TYPE_CHECKING:
from . import serialisation as serialisation
from . import testing as testing
from . import utils as utils
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy
from ._strategies import get_resource as get_resource
from ._generation import LogitsProcessorList as LogitsProcessorList
from ._generation import StopOnTokens as StopOnTokens
from ._generation import StoppingCriteriaList as StoppingCriteriaList
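The new re-exports above come from `openllm._generation`, which provides stopping-criteria helpers used during token generation. As a hedged illustration of what a `StopOnTokens`-style criterion usually looks like on top of `transformers.StoppingCriteria` (the actual class in `_generation` may differ):

# Illustrative stop-on-token criterion; shows how transformers' StoppingCriteria
# hooks into generation, not the exact openllm._generation.StopOnTokens code.
import torch
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnTokensSketch(StoppingCriteria):
  def __init__(self, stop_token_ids: list[int]) -> None:
    self.stop_token_ids = stop_token_ids

  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
    # Stop as soon as the most recently generated token is one of the stop ids.
    return int(input_ids[0, -1]) in self.stop_token_ids

# Usage: model.generate(..., stopping_criteria=StoppingCriteriaList([StopOnTokensSketch([2])]))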
@@ -19,7 +19,6 @@ from bentoml._internal.models.model import ModelSignature
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
from openllm_core._schemas import CompletionChunk
from openllm_core._schemas import GenerationOutput
from ._strategies import CascadingResourceStrategy
from openllm_core._typing_compat import AdapterMap
from openllm_core._typing_compat import AdapterTuple
from openllm_core._typing_compat import AdapterType
@@ -51,6 +50,7 @@ from openllm_core.utils import resolve_filepath
from openllm_core.utils import validate_is_path

from ._quantisation import infer_quantisation_config
from ._strategies import CascadingResourceStrategy
from .exceptions import ForbiddenAttributeError
from .exceptions import OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME
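`infer_quantisation_config` (imported above from `._quantisation`) turns a user-facing quantisation choice into backend-specific settings. A rough sketch of the idea, mapping a scheme name to a `transformers.BitsAndBytesConfig`; the real function handles more backends and options, so the mapping below is illustrative only:

# Rough sketch of inferring a quantisation config from a scheme name; not the
# actual openllm._quantisation.infer_quantisation_config implementation.
import torch
from transformers import BitsAndBytesConfig

def infer_quantisation_config_sketch(quantise: str) -> BitsAndBytesConfig:
  if quantise == 'int8':
    return BitsAndBytesConfig(load_in_8bit=True)
  if quantise == 'int4':
    return BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_compute_dtype=torch.bfloat16,
                              bnb_4bit_quant_type='nf4',
                              bnb_4bit_use_double_quant=True)
  raise ValueError(f'Unknown quantisation scheme: {quantise!r}')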
@@ -3,7 +3,6 @@ from __future__ import annotations
import logging
import os
import typing as t
import warnings

import _service_vars as svars
import orjson
@@ -11,47 +10,43 @@ import orjson
import bentoml
import openllm

# The following warnings from bitsandbytes, and probably not that important for users to see
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
from bentoml.io import JSON
from bentoml.io import Text

logger = logging.getLogger(__name__)
model = svars.model
model_id = svars.model_id
adapter_map = svars.adapter_map
model_tag = svars.model_tag
llm_config = openllm.AutoConfig.for_model(model)
llm = openllm.LLM[t.Any, t.Any](model_id,
llm_config = openllm.AutoConfig.for_model(svars.model)
llm = openllm.LLM[t.Any, t.Any](svars.model_id,
llm_config=llm_config,
model_tag=model_tag,
model_tag=svars.model_tag,
prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), getattr(llm_config, 'default_prompt_template', None)),
system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), getattr(llm_config, 'default_system_message', None)),
serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
adapter_map=orjson.loads(adapter_map))
adapter_map=orjson.loads(svars.adapter_map))
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[llm.runner])
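The service pulls its configuration from the generated `_service_vars` module (`svars`). A hypothetical sketch of such a module is shown below; the attribute names come from the diff above, while the environment-variable names are assumptions rather than what OpenLLM actually writes into the Bento:

# Hypothetical _service_vars sketch. Attribute names (model, model_id,
# model_tag, adapter_map) match the diff; the env var names are assumptions.
import os

model = os.environ.get('OPENLLM_MODEL', 'opt')  # assumed env var
model_id = os.environ.get('OPENLLM_MODEL_ID', 'facebook/opt-1.3b')  # assumed env var
model_tag = os.environ.get('OPENLLM_MODEL_TAG') or None  # assumed env var
adapter_map = os.environ.get('OPENLLM_ADAPTER_MAP', 'null')  # JSON string, parsed with orjson.loads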
llm_model_class = openllm.GenerationInput.from_llm_config(llm_config)

@svc.api(route='/v1/generate', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.JSON.from_sample(openllm.GenerationOutput.examples().model_dump()))
@svc.api(route='/v1/generate', input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
return await llm.generate(**llm_model_class(**input_dict).model_dump())

@svc.api(route='/v1/generate_stream', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.Text(content_type='text/event-stream'))
@svc.api(route='/v1/generate_stream', input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
yield f'data: {it.model_dump_json()}\n\n'
yield 'data: [DONE]\n\n'
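The streaming endpoint emits server-sent-event frames of the form `data: <json>`, terminated by `data: [DONE]`. A minimal client sketch, assuming the service runs locally on BentoML's default port 3000 and that the generation schema accepts a `prompt` field:

# Minimal client sketch for /v1/generate and /v1/generate_stream; the port and
# payload fields are assumptions based on BentoML defaults and the diff above.
import asyncio
import json

import httpx

async def main() -> None:
  async with httpx.AsyncClient(base_url='http://localhost:3000', timeout=None) as client:
    # Non-streaming generation.
    resp = await client.post('/v1/generate', json={'prompt': 'What is OpenLLM?'})
    print(resp.json())

    # Streaming generation: read SSE frames until the [DONE] sentinel.
    async with client.stream('POST', '/v1/generate_stream', json={'prompt': 'What is OpenLLM?'}) as stream:
      async for line in stream.aiter_lines():
        if not line.startswith('data: '):
          continue
        payload = line[len('data: '):]
        if payload == '[DONE]':
          break
        print(json.loads(payload))

asyncio.run(main())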
@svc.api(route='/v1/metadata', input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample(openllm.MetadataOutput.examples(llm).model_dump()))
_Metadata = openllm.MetadataOutput(timeout=llm_config['timeout'],
model_name=llm_config['model_name'],
backend=llm.__llm_backend__,
model_id=llm.model_id,
configuration=llm_config.model_dump_json().decode(),
prompt_template=llm.runner.prompt_template,
system_message=llm.runner.system_message)

@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
def metadata_v1(_: str) -> openllm.MetadataOutput:
return openllm.MetadataOutput(timeout=llm_config['timeout'],
model_name=llm_config['model_name'],
backend=llm.__llm_backend__,
model_id=llm.model_id,
configuration=llm_config.model_dump_json().decode(),
prompt_template=llm.runner.prompt_template,
system_message=llm.runner.system_message)
return _Metadata

openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
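`openllm.mount_entrypoints(svc, llm)` attaches additional HTTP entrypoints (such as the OpenAI-compatible routes) to the Bento service after all native APIs are defined. A rough sketch of the general mechanism using `Service.mount_asgi_app`; the real implementation also patches the OpenAPI schema (the hack the comment refers to), and the FastAPI app and route below are purely illustrative:

# Illustrative sketch of mounting extra entrypoints onto a bentoml.Service;
# not the actual openllm.mount_entrypoints implementation.
import bentoml
from fastapi import FastAPI

def mount_entrypoints_sketch(svc: bentoml.Service, llm) -> bentoml.Service:
  app = FastAPI()

  @app.get('/v1/models')
  def list_models() -> dict:
    # Advertise the single model served by this service.
    return {'object': 'list', 'data': [{'id': llm.model_id, 'object': 'model'}]}

  # mount_asgi_app attaches the ASGI app under the service's HTTP server.
  svc.mount_asgi_app(app, path='/')
  return svc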
@@ -9,12 +9,13 @@ import types
import typing as t
import warnings

import psutil, bentoml
import psutil

import bentoml

from bentoml._internal.resource import get_resource
from bentoml._internal.resource import system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS

from openllm_core._typing_compat import overload
from openllm_core.utils import DEBUG
from openllm_core.utils import ReprMixin
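`CascadingResourceStrategy` (re-exported in the first hunk) decides how runner workers are allocated by preferring GPU resources and falling back to CPUs. The standalone sketch below captures only that cascading idea; the real strategy implements BentoML's runner Strategy interface and per-worker environments, so this is not the actual API:

# Standalone sketch of the "cascading" idea: prefer visible GPUs, otherwise
# fall back to CPU workers. Not BentoML's Strategy interface.
import os

import psutil

def pick_workers_sketch() -> dict[str, object]:
  visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
  gpu_ids = [g for g in visible.split(',') if g.strip() != '']
  if gpu_ids:
    # One worker per visible GPU.
    return {'device': 'gpu', 'workers': len(gpu_ids)}
  # CPU fallback: one worker, threads bounded by physical cores.
  return {'device': 'cpu', 'workers': 1, 'threads': psutil.cpu_count(logical=False) or 1}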