Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-01-20 13:29:35 -05:00)
chore(service): cleanup API (#579)
* chore(service): cleanup API
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
* chore: running tools
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
* fix: tests import
  Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
---------
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -67,6 +67,7 @@ _import_structure: dict[str, list[str]] = {
'protocol': [],
'utils': [],
'_deprecated': ['Runner'],
'_strategies': ['CascadingResourceStrategy', 'get_resource'],
'entrypoints': ['mount_entrypoints'],
'serialisation': ['ggml', 'transformers'],
'cli._sdk': ['start', 'start_grpc', 'build', 'import_model', 'list_models'],
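The `_import_structure` table drives lazy imports for the top-level `openllm` package: submodules and their exported names are only imported on first attribute access. Below is a minimal sketch of that pattern (PEP 562 module `__getattr__`); OpenLLM's actual `__init__.py` uses its own LazyModule-style helper, so treat this as an illustration, not the real implementation.

# Minimal sketch of the lazy-import pattern behind _import_structure; this is
# NOT OpenLLM's actual helper, just the general PEP 562 approach it follows.
import importlib
import typing as t

_import_structure: dict[str, list[str]] = {
  '_strategies': ['CascadingResourceStrategy', 'get_resource'],
  'entrypoints': ['mount_entrypoints'],
}

# Reverse map: exported symbol -> submodule that defines it.
_symbol_to_module = {sym: mod for mod, syms in _import_structure.items() for sym in syms}

def __getattr__(name: str) -> t.Any:
  if name in _import_structure:  # e.g. `openllm.entrypoints`
    return importlib.import_module(f'.{name}', __name__)
  if name in _symbol_to_module:  # e.g. `openllm.mount_entrypoints`
    module = importlib.import_module(f'.{_symbol_to_module[name]}', __name__)
    return getattr(module, name)
  raise AttributeError(f'module {__name__!r} has no attribute {name!r}')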
@@ -84,6 +85,8 @@ if _t.TYPE_CHECKING:
from . import serialisation as serialisation
from . import testing as testing
from . import utils as utils
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy
from ._strategies import get_resource as get_resource
from ._generation import LogitsProcessorList as LogitsProcessorList
from ._generation import StopOnTokens as StopOnTokens
from ._generation import StoppingCriteriaList as StoppingCriteriaList
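The new re-exports above come from `openllm._generation`, which provides stopping-criteria helpers used during token generation. As a hedged illustration of what a `StopOnTokens`-style criterion usually looks like on top of `transformers.StoppingCriteria` (the actual class in `_generation` may differ):

# Illustrative stop-on-token criterion; shows how transformers' StoppingCriteria
# hooks into generation, not the exact openllm._generation.StopOnTokens code.
import torch
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnTokensSketch(StoppingCriteria):
  def __init__(self, stop_token_ids: list[int]) -> None:
    self.stop_token_ids = stop_token_ids

  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
    # Stop as soon as the most recently generated token is one of the stop ids.
    return int(input_ids[0, -1]) in self.stop_token_ids

# Usage: model.generate(..., stopping_criteria=StoppingCriteriaList([StopOnTokensSketch([2])]))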
@@ -19,7 +19,6 @@ from bentoml._internal.models.model import ModelSignature
from bentoml._internal.runner.runner_handle import DummyRunnerHandle
from openllm_core._schemas import CompletionChunk
from openllm_core._schemas import GenerationOutput
from ._strategies import CascadingResourceStrategy
from openllm_core._typing_compat import AdapterMap
from openllm_core._typing_compat import AdapterTuple
from openllm_core._typing_compat import AdapterType
@@ -51,6 +50,7 @@ from openllm_core.utils import resolve_filepath
from openllm_core.utils import validate_is_path

from ._quantisation import infer_quantisation_config
from ._strategies import CascadingResourceStrategy
from .exceptions import ForbiddenAttributeError
from .exceptions import OpenLLMException
from .serialisation.constants import PEFT_CONFIG_NAME
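`infer_quantisation_config` (imported above from `._quantisation`) turns a user-facing quantisation choice into backend-specific settings. A rough sketch of the idea, mapping a scheme name to a `transformers.BitsAndBytesConfig`; the real function handles more backends and options, so the mapping below is illustrative only:

# Rough sketch of inferring a quantisation config from a scheme name; not the
# actual openllm._quantisation.infer_quantisation_config implementation.
import torch
from transformers import BitsAndBytesConfig

def infer_quantisation_config_sketch(quantise: str) -> BitsAndBytesConfig:
  if quantise == 'int8':
    return BitsAndBytesConfig(load_in_8bit=True)
  if quantise == 'int4':
    return BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_compute_dtype=torch.bfloat16,
                              bnb_4bit_quant_type='nf4',
                              bnb_4bit_use_double_quant=True)
  raise ValueError(f'Unknown quantisation scheme: {quantise!r}')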
@@ -3,7 +3,6 @@ from __future__ import annotations
import logging
import os
import typing as t
import warnings

import _service_vars as svars
import orjson
@@ -11,47 +10,43 @@ import orjson
import bentoml
import openllm

# The following warnings from bitsandbytes, and probably not that important for users to see
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
from bentoml.io import JSON
from bentoml.io import Text

logger = logging.getLogger(__name__)
model = svars.model
model_id = svars.model_id
adapter_map = svars.adapter_map
model_tag = svars.model_tag
llm_config = openllm.AutoConfig.for_model(model)
llm = openllm.LLM[t.Any, t.Any](model_id,
llm_config = openllm.AutoConfig.for_model(svars.model)
llm = openllm.LLM[t.Any, t.Any](svars.model_id,
llm_config=llm_config,
model_tag=model_tag,
model_tag=svars.model_tag,
prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), getattr(llm_config, 'default_prompt_template', None)),
system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), getattr(llm_config, 'default_system_message', None)),
serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
adapter_map=orjson.loads(adapter_map))
adapter_map=orjson.loads(svars.adapter_map))
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[llm.runner])
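The service pulls its configuration from the generated `_service_vars` module (`svars`). A hypothetical sketch of such a module is shown below; the attribute names come from the diff above, while the environment-variable names are assumptions rather than what OpenLLM actually writes into the Bento:

# Hypothetical _service_vars sketch. Attribute names (model, model_id,
# model_tag, adapter_map) match the diff; the env var names are assumptions.
import os

model = os.environ.get('OPENLLM_MODEL', 'opt')  # assumed env var
model_id = os.environ.get('OPENLLM_MODEL_ID', 'facebook/opt-1.3b')  # assumed env var
model_tag = os.environ.get('OPENLLM_MODEL_TAG') or None  # assumed env var
adapter_map = os.environ.get('OPENLLM_ADAPTER_MAP', 'null')  # JSON string, parsed with orjson.loads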
llm_model_class = openllm.GenerationInput.from_llm_config(llm_config)

@svc.api(route='/v1/generate', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.JSON.from_sample(openllm.GenerationOutput.examples().model_dump()))
@svc.api(route='/v1/generate', input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
return await llm.generate(**llm_model_class(**input_dict).model_dump())

@svc.api(route='/v1/generate_stream', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.Text(content_type='text/event-stream'))
@svc.api(route='/v1/generate_stream', input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
yield f'data: {it.model_dump_json()}\n\n'
yield 'data: [DONE]\n\n'
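The streaming endpoint emits server-sent-event frames of the form `data: <json>`, terminated by `data: [DONE]`. A minimal client sketch, assuming the service runs locally on BentoML's default port 3000 and that the generation schema accepts a `prompt` field:

# Minimal client sketch for /v1/generate and /v1/generate_stream; the port and
# payload fields are assumptions based on BentoML defaults and the diff above.
import asyncio
import json

import httpx

async def main() -> None:
  async with httpx.AsyncClient(base_url='http://localhost:3000', timeout=None) as client:
    # Non-streaming generation.
    resp = await client.post('/v1/generate', json={'prompt': 'What is OpenLLM?'})
    print(resp.json())

    # Streaming generation: read SSE frames until the [DONE] sentinel.
    async with client.stream('POST', '/v1/generate_stream', json={'prompt': 'What is OpenLLM?'}) as stream:
      async for line in stream.aiter_lines():
        if not line.startswith('data: '):
          continue
        payload = line[len('data: '):]
        if payload == '[DONE]':
          break
        print(json.loads(payload))

asyncio.run(main())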
@svc.api(route='/v1/metadata', input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample(openllm.MetadataOutput.examples(llm).model_dump()))
_Metadata = openllm.MetadataOutput(timeout=llm_config['timeout'],
model_name=llm_config['model_name'],
backend=llm.__llm_backend__,
model_id=llm.model_id,
configuration=llm_config.model_dump_json().decode(),
prompt_template=llm.runner.prompt_template,
system_message=llm.runner.system_message)

@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
def metadata_v1(_: str) -> openllm.MetadataOutput:
return openllm.MetadataOutput(timeout=llm_config['timeout'],
model_name=llm_config['model_name'],
backend=llm.__llm_backend__,
model_id=llm.model_id,
configuration=llm_config.model_dump_json().decode(),
prompt_template=llm.runner.prompt_template,
system_message=llm.runner.system_message)
return _Metadata

openllm.mount_entrypoints(svc, llm) # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
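`openllm.mount_entrypoints(svc, llm)` attaches additional HTTP entrypoints (such as the OpenAI-compatible routes) to the Bento service after all native APIs are defined. A rough sketch of the general mechanism using `Service.mount_asgi_app`; the real implementation also patches the OpenAPI schema (the hack the comment refers to), and the FastAPI app and route below are purely illustrative:

# Illustrative sketch of mounting extra entrypoints onto a bentoml.Service;
# not the actual openllm.mount_entrypoints implementation.
import bentoml
from fastapi import FastAPI

def mount_entrypoints_sketch(svc: bentoml.Service, llm) -> bentoml.Service:
  app = FastAPI()

  @app.get('/v1/models')
  def list_models() -> dict:
    # Advertise the single model served by this service.
    return {'object': 'list', 'data': [{'id': llm.model_id, 'object': 'model'}]}

  # mount_asgi_app attaches the ASGI app under the service's HTTP server.
  svc.mount_asgi_app(app, path='/')
  return svc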
@@ -9,12 +9,13 @@ import types
import typing as t
import warnings

import psutil, bentoml
import psutil

import bentoml

from bentoml._internal.resource import get_resource
from bentoml._internal.resource import system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS

from openllm_core._typing_compat import overload
from openllm_core.utils import DEBUG
from openllm_core.utils import ReprMixin
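`CascadingResourceStrategy` (re-exported in the first hunk) decides how runner workers are allocated by preferring GPU resources and falling back to CPUs. The standalone sketch below captures only that cascading idea; the real strategy implements BentoML's runner Strategy interface and per-worker environments, so this is not the actual API:

# Standalone sketch of the "cascading" idea: prefer visible GPUs, otherwise
# fall back to CPU workers. Not BentoML's Strategy interface.
import os

import psutil

def pick_workers_sketch() -> dict[str, object]:
  visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
  gpu_ids = [g for g in visible.split(',') if g.strip() != '']
  if gpu_ids:
    # One worker per visible GPU.
    return {'device': 'gpu', 'workers': len(gpu_ids)}
  # CPU fallback: one worker, threads bounded by physical cores.
  return {'device': 'cpu', 'workers': 1, 'threads': psutil.cpu_count(logical=False) or 1}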