diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py
index d993b5e5..82a12a86 100644
--- a/openllm-core/src/openllm_core/_configuration.py
+++ b/openllm-core/src/openllm_core/_configuration.py
@@ -1,7 +1,6 @@
 # mypy: disable-error-code="attr-defined,no-untyped-call,type-var,operator,arg-type,no-redef,misc"
 from __future__ import annotations
 import copy
-import enum
 import logging
 import os
 import sys
@@ -39,7 +38,6 @@ from ._typing_compat import Required
 from ._typing_compat import Self
 from ._typing_compat import overload
 from .exceptions import ForbiddenAttributeError
-from .utils import MYPY
 from .utils import LazyLoader
 from .utils import ReprMixin
 from .utils import codegen
@@ -48,8 +46,10 @@ from .utils import dantic
 from .utils import field_env_key
 from .utils import first_not_none
 from .utils import lenient_issubclass
-from .utils.peft import FineTuneConfig, PEFT_TASK_TYPE_TARGET_MAPPING, PeftType
 from .utils.import_utils import is_vllm_available
+from .utils.peft import PEFT_TASK_TYPE_TARGET_MAPPING
+from .utils.peft import FineTuneConfig
+from .utils.peft import PeftType
 
 if t.TYPE_CHECKING:
   import click
diff --git a/openllm-core/src/openllm_core/_schemas.py b/openllm-core/src/openllm_core/_schemas.py
index 75784ffe..aa5d630c 100644
--- a/openllm-core/src/openllm_core/_schemas.py
+++ b/openllm-core/src/openllm_core/_schemas.py
@@ -14,11 +14,6 @@ from .utils import gen_random_uuid
 if t.TYPE_CHECKING:
   import vllm
 
-  import openllm
-
-  from ._typing_compat import M
-  from ._typing_compat import T
-
 @attr.frozen(slots=True)
 class MetadataOutput:
   model_id: str
@@ -29,20 +24,11 @@ class MetadataOutput:
   prompt_template: str
   system_message: str
 
-  @classmethod
-  def examples(cls, llm: openllm.LLM[M, T]) -> MetadataOutput:
-    return cls(model_id=llm.model_id,
-               timeout=llm.config['timeout'],
-               model_name=llm.config['model_name'],
-               backend=llm.__llm_backend__,
-               configuration=llm.config.model_dump_json().decode(),
-               prompt_template='{system_message}',
-               system_message='You are a helpful assistant.')
+  def model_dump(self) -> dict[str, t.Any]:
+    return converter.unstructure(self)
 
-  # yapf: disable
-  def model_dump(self)->dict[str, t.Any]: return converter.unstructure(self)
-  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_INDENT_2).decode('utf-8')
-  # yapf: enable
+  def model_dump_json(self) -> str:
+    return orjson.dumps(self.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')
 
 @attr.define(slots=True, frozen=True)
 class GenerationInput:
@@ -51,12 +37,15 @@ class GenerationInput:
   stop: list[str] | None = attr.field(default=None)
   adapter_name: str | None = attr.field(default=None)
 
-  # yapf: disable
   @classmethod
-  def from_model(cls,model_name:str,**attrs: t.Any)->type[GenerationInput]:return cls.from_llm_config(AutoConfig.for_model(model_name,**attrs))
-  def model_dump(self)->dict[str,t.Any]:return {'prompt': self.prompt,'stop': self.stop,'llm_config': self.llm_config.model_dump(flatten=True),'adapter_name': self.adapter_name}
-  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_INDENT_2).decode('utf-8')
-  # yapf: enable
+  def from_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
+    return cls.from_llm_config(AutoConfig.for_model(model_name, **attrs))
+
+  def model_dump(self) -> dict[str, t.Any]:
+    return {'prompt': self.prompt, 'stop': self.stop, 'llm_config': self.llm_config.model_dump(flatten=True), 'adapter_name': self.adapter_name}
+
+  def model_dump_json(self) -> str:
+    return orjson.dumps(self.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')
 
   @classmethod
   def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]:
@@ -78,8 +67,8 @@ class GenerationInput:
                             repr=True,
                             collect_by_mro=True)
 
-    def examples(_: type[GenerationInput]) -> GenerationInput:
-      return klass(prompt='What is the meaning of life?', llm_config=llm_config, stop=['\n'])
+    def examples(_: type[GenerationInput]) -> dict[str, t.Any]:
+      return klass(prompt='What is the meaning of life?', llm_config=llm_config, stop=['\n']).model_dump()
 
     setattr(klass, 'examples', classmethod(examples))
 
@@ -121,7 +110,7 @@ class GenerationOutput:
   request_id: str = attr.field(factory=lambda: gen_random_uuid())
 
   @classmethod
-  def examples(cls) -> GenerationOutput:
+  def examples(cls) -> dict[str, t.Any]:
     return cls(prompt='What is the meaning of life?',
                finished=True,
                outputs=[
@@ -134,7 +123,7 @@ class GenerationOutput:
                ],
                prompt_token_ids=[2, 2264, 16, 5, 3099, 9, 301, 116],
                prompt_logprobs=None,
-               request_id=gen_random_uuid())
+               request_id=gen_random_uuid()).model_dump()
 
   @staticmethod
   def _preprocess_sse_message(data: str) -> str:
@@ -164,8 +153,11 @@ class GenerationOutput:
                prompt_token_ids=request_output.prompt_token_ids,
                prompt_logprobs=request_output.prompt_logprobs)
 
-  # yapf: disable
-  def with_options(self,**options: t.Any)->GenerationOutput: return attr.evolve(self, **options)
-  def model_dump(self)->dict[str, t.Any]:return converter.unstructure(self)
-  def model_dump_json(self)->str:return orjson.dumps(self.model_dump(),option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
-  # yapf: enable
+  def with_options(self, **options: t.Any) -> GenerationOutput:
+    return attr.evolve(self, **options)
+
+  def model_dump(self) -> dict[str, t.Any]:
+    return converter.unstructure(self)
+
+  def model_dump_json(self) -> str:
+    return orjson.dumps(self.model_dump(), option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
diff --git a/openllm-core/src/openllm_core/utils/peft.py b/openllm-core/src/openllm_core/utils/peft.py
index 2c1f51ee..48a8eca5 100644
--- a/openllm-core/src/openllm_core/utils/peft.py
+++ b/openllm-core/src/openllm_core/utils/peft.py
@@ -1,6 +1,12 @@
 from __future__ import annotations
-import enum, typing as t, inflection, attr
+import enum
+import typing as t
+
+import attr
+import inflection
+
 from deepmerge import Merger
+
 from . import dantic
 from ..exceptions import ForbiddenAttributeError
 
@@ -8,8 +14,9 @@ config_merger = Merger([(dict, 'merge')], ['override'], ['override'])
 
 if t.TYPE_CHECKING:
   from peft.config import PeftConfig
-  from .._typing_compat import AdapterType
+
   from .._configuration import LLMConfig
+  from .._typing_compat import AdapterType
 
 # case insensitive, but rename to conform with type
 class _PeftEnumMeta(enum.EnumMeta):
@@ -72,7 +79,8 @@ class FineTuneConfig:
 
   def build(self) -> PeftConfig:
     try:
-      from peft import TaskType, PEFT_TYPE_TO_CONFIG_MAPPING, get_peft_config
+      from peft.utils.peft_types import TaskType
+      from peft.mapping import get_peft_config
     except ImportError:
       raise ImportError('PEFT is not installed. Please install it via `pip install "openllm[fine-tune]"`.')
     adapter_config = self.adapter_config.copy()
@@ -83,7 +91,7 @@ class FineTuneConfig:
     # respect user set task_type if it is passed, otherwise use one managed by OpenLLM
     inference_mode = adapter_config.pop('inference_mode', self.inference_mode)
     task_type = adapter_config.pop('task_type', TaskType[self.llm_config_class.peft_task_type()])
-    adapter_config = {'peft_type': self.adapter_type.value, "task_type": task_type, "inference_mode": inference_mode, **adapter_config}
+    adapter_config = {'peft_type': self.adapter_type.value, 'task_type': task_type, 'inference_mode': inference_mode, **adapter_config}
     return get_peft_config(adapter_config)
 
   def train(self) -> FineTuneConfig:
diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py
index 200524e0..0447e5fb 100644
--- a/openllm-python/src/openllm/__init__.py
+++ b/openllm-python/src/openllm/__init__.py
@@ -67,6 +67,7 @@ _import_structure: dict[str, list[str]] = {
     'protocol': [],
     'utils': [],
     '_deprecated': ['Runner'],
+    '_strategies': ['CascadingResourceStrategy', 'get_resource'],
    'entrypoints': ['mount_entrypoints'],
     'serialisation': ['ggml', 'transformers'],
     'cli._sdk': ['start', 'start_grpc', 'build', 'import_model', 'list_models'],
@@ -84,6 +85,8 @@ if _t.TYPE_CHECKING:
   from . import serialisation as serialisation
   from . import testing as testing
   from . import utils as utils
+  from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy
+  from ._strategies import get_resource as get_resource
   from ._generation import LogitsProcessorList as LogitsProcessorList
   from ._generation import StopOnTokens as StopOnTokens
   from ._generation import StoppingCriteriaList as StoppingCriteriaList
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index 824e972c..c22161b4 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -19,7 +19,6 @@ from bentoml._internal.models.model import ModelSignature
 from bentoml._internal.runner.runner_handle import DummyRunnerHandle
 from openllm_core._schemas import CompletionChunk
 from openllm_core._schemas import GenerationOutput
-from ._strategies import CascadingResourceStrategy
 from openllm_core._typing_compat import AdapterMap
 from openllm_core._typing_compat import AdapterTuple
 from openllm_core._typing_compat import AdapterType
@@ -51,6 +50,7 @@ from openllm_core.utils import resolve_filepath
 from openllm_core.utils import validate_is_path
 
 from ._quantisation import infer_quantisation_config
+from ._strategies import CascadingResourceStrategy
 from .exceptions import ForbiddenAttributeError
 from .exceptions import OpenLLMException
 from .serialisation.constants import PEFT_CONFIG_NAME
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index 003ea69f..4e532fd2 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -3,7 +3,6 @@ from __future__ import annotations
 import logging
 import os
 import typing as t
-import warnings
 
 import _service_vars as svars
 import orjson
@@ -11,47 +10,43 @@ import orjson
 import bentoml
 import openllm
 
-# The following warnings from bitsandbytes, and probably not that important for users to see
-warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization')
-warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization')
-warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.')
+from bentoml.io import JSON
+from bentoml.io import Text
 
 logger = logging.getLogger(__name__)
 
-model = svars.model
-model_id = svars.model_id
-adapter_map = svars.adapter_map
-model_tag = svars.model_tag
-llm_config = openllm.AutoConfig.for_model(model)
-llm = openllm.LLM[t.Any, t.Any](model_id,
+llm_config = openllm.AutoConfig.for_model(svars.model)
+llm = openllm.LLM[t.Any, t.Any](svars.model_id,
                                 llm_config=llm_config,
-                                model_tag=model_tag,
+                                model_tag=svars.model_tag,
                                 prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), getattr(llm_config, 'default_prompt_template', None)),
                                 system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), getattr(llm_config, 'default_system_message', None)),
                                 serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), default=llm_config['serialisation']),
-                                adapter_map=orjson.loads(adapter_map))
+                                adapter_map=orjson.loads(svars.adapter_map))
 svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[llm.runner])
 llm_model_class = openllm.GenerationInput.from_llm_config(llm_config)
 
-@svc.api(route='/v1/generate', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.JSON.from_sample(openllm.GenerationOutput.examples().model_dump()))
+@svc.api(route='/v1/generate', input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()))
 async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
   return await llm.generate(**llm_model_class(**input_dict).model_dump())
 
-@svc.api(route='/v1/generate_stream', input=bentoml.io.JSON.from_sample(llm_model_class.examples().model_dump()), output=bentoml.io.Text(content_type='text/event-stream'))
+@svc.api(route='/v1/generate_stream', input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'))
 async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
   async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
     yield f'data: {it.model_dump_json()}\n\n'
   yield 'data: [DONE]\n\n'
 
-@svc.api(route='/v1/metadata', input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample(openllm.MetadataOutput.examples(llm).model_dump()))
+_Metadata = openllm.MetadataOutput(timeout=llm_config['timeout'],
+                                   model_name=llm_config['model_name'],
+                                   backend=llm.__llm_backend__,
+                                   model_id=llm.model_id,
+                                   configuration=llm_config.model_dump_json().decode(),
+                                   prompt_template=llm.runner.prompt_template,
+                                   system_message=llm.runner.system_message)
+
+@svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
 def metadata_v1(_: str) -> openllm.MetadataOutput:
-  return openllm.MetadataOutput(timeout=llm_config['timeout'],
-                                model_name=llm_config['model_name'],
-                                backend=llm.__llm_backend__,
-                                model_id=llm.model_id,
-                                configuration=llm_config.model_dump_json().decode(),
-                                prompt_template=llm.runner.prompt_template,
-                                system_message=llm.runner.system_message)
+  return _Metadata
 
 openllm.mount_entrypoints(svc, llm)
 # HACK: This must always be the last line in this file, as we will do some MK for OpenAPI schema.
diff --git a/openllm-python/src/openllm/_strategies.py b/openllm-python/src/openllm/_strategies.py
index 1dcb9c59..a3b19e52 100644
--- a/openllm-python/src/openllm/_strategies.py
+++ b/openllm-python/src/openllm/_strategies.py
@@ -9,12 +9,13 @@ import types
 import typing as t
 import warnings
 
-import psutil, bentoml
+import psutil
+
+import bentoml
 
 from bentoml._internal.resource import get_resource
 from bentoml._internal.resource import system_resources
 from bentoml._internal.runner.strategy import THREAD_ENVS
-
 from openllm_core._typing_compat import overload
 from openllm_core.utils import DEBUG
 from openllm_core.utils import ReprMixin
diff --git a/ruff.toml b/ruff.toml
index f878776c..2848eda1 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -43,7 +43,7 @@ target-version = "py312"
 typing-modules = ["openllm_core._typing_compat"]
 unfixable = ["TCH004"]
 
-[flake8-type-checking]
+[lint.flake8-type-checking]
 exempt-modules = ["typing", "typing_extensions", "openllm_core._typing_compat"]
 runtime-evaluated-base-classes = [
   "openllm_core._configuration.LLMConfig",
@@ -59,13 +59,13 @@ quote-style = "single"
 indent-style = "space"
 skip-magic-trailing-comma = true
 
-[pydocstyle]
+[lint.pydocstyle]
 convention = "google"
 
-[pycodestyle]
+[lint.pycodestyle]
 ignore-overlong-task-comments = true
 
-[isort]
+[lint.isort]
 combine-as-imports = true
 force-single-line = true
 force-wrap-aliases = false
@@ -85,13 +85,13 @@ lines-between-types = 1
 no-lines-before = ["future", "standard-library"]
 relative-imports-order = "closest-to-furthest"
 
-[flake8-quotes]
+[lint.flake8-quotes]
 avoid-escape = false
 multiline-quotes = "single"
 inline-quotes = "single"
 docstring-quotes = "double"
 
-[extend-per-file-ignores]
+[lint.extend-per-file-ignores]
 "openllm-python/src/openllm/models/**" = ["E", "F", "I001"]
 "openllm-python/tests/**/*" = ["S101", "TID252", "PT011", "S307"]
 "openllm-python/src/openllm/_llm.py" = ["F811"]