diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 953108e3..7f7a7faa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 ci:
   autoupdate_schedule: weekly
-  skip: [check-models-table-update, changelog-dry-run, pyright, yapf, mypy, sync-readme, clj-kondo]
+  skip: [check-models-table-update, changelog-dry-run, pyright, mypy, sync-readme, clj-kondo]
   autofix_commit_msg: "ci: auto fixes from pre-commit.ci\n\nFor more information, see https://pre-commit.ci"
   autoupdate_commit_msg: 'ci: pre-commit autoupdate [pre-commit.ci]'
 default_language_version:
@@ -26,6 +26,11 @@ repos:
         types: [python]
         exclude: ^(docs|tools|openllm-python/tests)
         args: [--config=pyproject.toml]
+  - repo: https://github.com/google/yapf
+    rev: v0.40.1
+    hooks:
+      - id: yapf
+        verbose: true
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.5.0
     hooks:
@@ -71,16 +76,16 @@ repos:
       - id: check-yaml
         args: ['--unsafe']
       - id: check-toml
+      - id: check-docstring-first
+      - id: check-added-large-files
+      - id: debug-statements
+      - id: check-merge-conflict
+  - repo: meta
+    hooks:
+      - id: check-hooks-apply
+      - id: check-useless-excludes
   - repo: local
     hooks:
-      - id: yapf
-        verbose: true
-        name: yapf
-        entry: ./tools/yapf
-        types: [python]
-        language: python
-        pass_filenames: false
-        additional_dependencies: ['yapf==0.40.0']
       - id: check-models-table-update
         name: check if table in README.md is up-to-date
         entry: ./tools/assert-model-table-latest.py
diff --git a/hatch.toml b/hatch.toml
index 3146b9d0..b73fcf77 100644
--- a/hatch.toml
+++ b/hatch.toml
@@ -32,6 +32,7 @@ inplace-changelog = "towncrier build --version main --keep"
 quality = [
   "./tools/dependencies.py",
   "./tools/update-readme.py",
+  "- ./tools/yapf",
   "- ./tools/update-brew-tap.py",
   "bash ./tools/sync-readme.sh",
   "check-stubs",
diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py
index 5a4b90c0..c8c7a596 100644
--- a/openllm-client/src/openllm_client/_base.py
+++ b/openllm-client/src/openllm_client/_base.py
@@ -9,43 +9,62 @@ from openllm_core._typing_compat import overload, LiteralString
 if t.TYPE_CHECKING:
   from openllm_core._typing_compat import LiteralRuntime, DictStrAny
   import transformers
-
 logger = logging.getLogger(__name__)
-
 @attr.define(slots=False, init=False)
 class _ClientAttr:
   _address: str
   _timeout: float = attr.field(default=30)
   _api_version: str = attr.field(default="v1")
-  def __init__(self, address: str, timeout: float = 30, api_version: str = "v1"): self.__attrs_init__(address, timeout, api_version)
+
+  def __init__(self, address: str, timeout: float = 30, api_version: str = "v1"):
+    self.__attrs_init__(address, timeout, api_version)
   @abc.abstractmethod
-  def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: raise NotImplementedError
+  def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any:
+    raise NotImplementedError
+
   @abc.abstractmethod
-  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: raise NotImplementedError
+  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
+    raise NotImplementedError
+
   @overload
   @abc.abstractmethod
-  def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
+  def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str:
+    ...
+
   @overload
   @abc.abstractmethod
-  def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
+ def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: + ... + @overload @abc.abstractmethod - def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: ... + def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: + ... + @abc.abstractmethod - def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any: raise NotImplementedError + def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any: + raise NotImplementedError + # NOTE: Scikit interface @overload @abc.abstractmethod - def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ... + def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: + ... + @overload @abc.abstractmethod - def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ... + def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: + ... + @overload @abc.abstractmethod - def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: ... + def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: + ... + @abc.abstractmethod - def predict(self, prompt: str, **attrs: t.Any) -> t.Any: raise NotImplementedError + def predict(self, prompt: str, **attrs: t.Any) -> t.Any: + raise NotImplementedError @functools.cached_property def _hf_agent(self) -> transformers.HfAgent: @@ -54,46 +73,77 @@ class _ClientAttr: if not is_transformers_supports_agent(): raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'") import transformers return transformers.HfAgent(urljoin(self._address, "/hf/agent")) + @property - def _metadata(self) -> t.Any: return self.call("metadata") + def _metadata(self) -> t.Any: + return self.call("metadata") + @property def model_name(self) -> str: - try: return self._metadata["model_name"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + try: + return self._metadata["model_name"] + except KeyError: + raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property def model_id(self) -> str: - try: return self._metadata["model_id"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + try: + return self._metadata["model_id"] + except KeyError: + raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property def framework(self) -> LiteralRuntime: - try: return self._metadata["framework"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + try: + return self._metadata["framework"] + except KeyError: + raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property def timeout(self) -> int: - try: return self._metadata["timeout"] - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + try: + return self._metadata["timeout"] + except KeyError: + raise RuntimeError("Malformed service endpoint. 
(Possible malicious)") from None + @property def configuration(self) -> dict[str, t.Any]: - try: return orjson.loads(self._metadata["configuration"]) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + try: + return orjson.loads(self._metadata["configuration"]) + except KeyError: + raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property def supports_embeddings(self) -> bool: - try: return self._metadata.get("supports_embeddings", False) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + try: + return self._metadata.get("supports_embeddings", False) + except KeyError: + raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property def supports_hf_agent(self) -> bool: - try: return self._metadata.get("supports_hf_agent", False) - except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None - @property - def config(self) -> openllm_core.LLMConfig: return openllm_core.AutoConfig.for_model(self.model_name).model_construct_env(**self.configuration) - @functools.cached_property - def inner(self) -> t.Any: raise NotImplementedError("'inner' client is not implemented.") + try: + return self._metadata.get("supports_hf_agent", False) + except KeyError: + raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None + @property + def config(self) -> openllm_core.LLMConfig: + return openllm_core.AutoConfig.for_model(self.model_name).model_construct_env(**self.configuration) + + @functools.cached_property + def inner(self) -> t.Any: + raise NotImplementedError("'inner' client is not implemented.") class _Client(_ClientAttr): _host: str _port: str - def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: return self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs) - def health(self) -> t.Any: return self.inner.health() + + def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: + return self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs) + + def health(self) -> t.Any: + return self.inner.health() + @functools.cached_property def inner(self) -> BentoClient: BentoClient.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout) @@ -103,22 +153,30 @@ class _Client(_ClientAttr): def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any: if agent_type == "hf": return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs) else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'") + def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: if len(args) > 1: raise ValueError("'args' should only take one positional argument.") task = kwargs.pop("task", args[0]) return_code = kwargs.pop("return_code", False) remote = kwargs.pop("remote", False) - try: return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs) + try: + return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs) except Exception as err: logger.error("Exception caught while sending instruction to HF agent: %s", err, exc_info=err) logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address) - class _AsyncClient(_ClientAttr): _host: str _port: str - def __init__(self, address: str, timeout: float = 30): self._address,self._timeout = address,timeout - async def call(self, 
api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: return await self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs) - async def health(self) -> t.Any: return await self.inner.health() + + def __init__(self, address: str, timeout: float = 30): + self._address, self._timeout = address, timeout + + async def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any: + return await self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs) + + async def health(self) -> t.Any: + return await self.inner.health() + @functools.cached_property def inner(self) -> AsyncBentoClient: ensure_exec_coro(AsyncBentoClient.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout)) @@ -129,6 +187,7 @@ class _AsyncClient(_ClientAttr): """Async version of agent.run.""" if agent_type == "hf": return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs) else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'") + async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any: if not is_transformers_supports_agent(): raise RuntimeError("This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0") if len(args) > 1: raise ValueError("'args' should only take one positional argument.") @@ -161,11 +220,16 @@ class _AsyncClient(_ClientAttr): else: tool_code = get_tool_creation_code(code, self._hf_agent.toolbox, remote=remote) return f"{tool_code}\n{code}" - class BaseClient(_Client): - def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError - def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput: return openllm_core.EmbeddingsOutput(**self.call("embeddings", list([prompt] if isinstance(prompt, str) else prompt))) - def predict(self, prompt: str, **attrs: t.Any) -> openllm_core.GenerationOutput | DictStrAny | str: return self.query(prompt, **attrs) + def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: + raise NotImplementedError + + def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput: + return openllm_core.EmbeddingsOutput(**self.call("embeddings", list([prompt] if isinstance(prompt, str) else prompt))) + + def predict(self, prompt: str, **attrs: t.Any) -> openllm_core.GenerationOutput | DictStrAny | str: + return self.query(prompt, **attrs) + def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any: return_raw_response = attrs.pop("return_raw_response", None) if return_raw_response is not None: @@ -181,11 +245,16 @@ class BaseClient(_Client): if return_response == "attrs": return r elif return_response == "raw": return bentoml_cattr.unstructure(r) else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs) - class BaseAsyncClient(_AsyncClient): - async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError - async def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput: return openllm_core.EmbeddingsOutput(**(await self.call("embeddings", list([prompt] if isinstance(prompt, str) else prompt)))) - async def predict(self, prompt: str, **attrs: t.Any) -> t.Any: return await self.query(prompt, **attrs) + async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: + raise NotImplementedError + + async def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput: + return 
openllm_core.EmbeddingsOutput(**(await self.call("embeddings", list([prompt] if isinstance(prompt, str) else prompt)))) + + async def predict(self, prompt: str, **attrs: t.Any) -> t.Any: + return await self.query(prompt, **attrs) + async def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any: return_raw_response = attrs.pop("return_raw_response", None) if return_raw_response is not None: diff --git a/openllm-client/src/openllm_client/benmin/__init__.py b/openllm-client/src/openllm_client/benmin/__init__.py index 885b7648..0df25044 100644 --- a/openllm-client/src/openllm_client/benmin/__init__.py +++ b/openllm-client/src/openllm_client/benmin/__init__.py @@ -17,22 +17,30 @@ from abc import abstractmethod if t.TYPE_CHECKING: from bentoml._internal.service.inference_api import InferenceAPI __all__ = ["Client", "AsyncClient"] - @attr.define(init=False) class Client: server_url: str endpoints: t.List[str] svc: bentoml.Service timeout: int = attr.field(default=30) + def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None: if len(svc.apis) == 0: raise bentoml.exceptions.BentoMLException("No APIs was found while constructing clients.") self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc) - for it, val in kwargs.items(): object.__setattr__(self, it, val) - def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any: return self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs) + for it, val in kwargs.items(): + object.__setattr__(self, it, val) + + def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any: + return self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs) + @abstractmethod - def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: raise NotImplementedError + def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: + raise NotImplementedError + @abstractmethod - def health(self) -> t.Any: raise NotImplementedError + def health(self) -> t.Any: + raise NotImplementedError + @classmethod def from_url(cls, url: str, **kwargs: t.Any) -> Client: try: @@ -41,7 +49,9 @@ class Client: except httpx.RemoteProtocolError: from ._grpc import GrpcClient return GrpcClient.from_url(url, **kwargs) - except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err + except Exception as err: + raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err + @staticmethod def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None: try: @@ -50,23 +60,32 @@ class Client: except httpx.RemoteProtocolError: from ._grpc import GrpcClient return GrpcClient.wait_until_server_ready(host, port, timeout, **kwargs) - except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err - + except Exception as err: + raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err @attr.define(init=False) class AsyncClient: server_url: str endpoints: t.List[str] svc: bentoml.Service timeout: int = attr.field(default=30) + def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None: if len(svc.apis) == 0: raise bentoml.exceptions.BentoMLException("No APIs was found while 
constructing clients.") self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc) - for it, val in kwargs.items(): object.__setattr__(self, it, val) - async def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any: return await self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs) + for it, val in kwargs.items(): + object.__setattr__(self, it, val) + + async def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any: + return await self._call(data, _inference_api=self.svc.apis[bentoml_api_name], **kwargs) + @abstractmethod - async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: raise NotImplementedError + async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: + raise NotImplementedError + @abstractmethod - async def health(self) -> t.Any: raise NotImplementedError + async def health(self) -> t.Any: + raise NotImplementedError + @classmethod async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncClient: try: @@ -75,7 +94,9 @@ class AsyncClient: except httpx.RemoteProtocolError: from ._grpc import AsyncGrpcClient return await AsyncGrpcClient.from_url(url, **kwargs) - except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err + except Exception as err: + raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err + @staticmethod async def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None: try: @@ -84,4 +105,5 @@ class AsyncClient: except httpx.RemoteProtocolError: from ._grpc import AsyncGrpcClient await AsyncGrpcClient.wait_until_server_ready(host, port, timeout, **kwargs) - except Exception as err: raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err + except Exception as err: + raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err diff --git a/openllm-client/src/openllm_client/benmin/_grpc.py b/openllm-client/src/openllm_client/benmin/_grpc.py index d13a0c82..985f75d2 100644 --- a/openllm-client/src/openllm_client/benmin/_grpc.py +++ b/openllm-client/src/openllm_client/benmin/_grpc.py @@ -10,23 +10,21 @@ if not is_grpc_available() or not is_grpc_health_available(): raise ImportError( from grpc import aio from google.protobuf import json_format import grpc, grpc_health.v1.health_pb2 as pb_health, grpc_health.v1.health_pb2_grpc as services_health - pb, services = import_generated_stubs("v1") if t.TYPE_CHECKING: from bentoml.grpc.v1.service_pb2 import ServiceMetadataResponse - logger = logging.getLogger(__name__) - class ClientCredentials(t.TypedDict): root_certificates: NotRequired[t.Union[bytes, str]] private_key: NotRequired[t.Union[bytes, str]] certificate_chain: NotRequired[t.Union[bytes, str]] - @overload -def dispatch_channel(server_url: str, typ: t.Literal["async"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = ...) -> aio.Channel: ... 
+def dispatch_channel(server_url: str, typ: t.Literal["async"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = ...) -> aio.Channel: + ... @overload -def dispatch_channel(server_url: str, typ: t.Literal["sync"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> grpc.Channel: ... +def dispatch_channel(server_url: str, typ: t.Literal["sync"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> grpc.Channel: + ... def dispatch_channel(server_url: str, typ: t.Literal["async", "sync"] = "sync", ssl: bool = False, ssl_client_credentials: ClientCredentials | None = None, options: t.Any | None = None, compression: grpc.Compression | None = None, interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> aio.Channel | grpc.Channel: credentials = None if ssl: @@ -38,25 +36,17 @@ def dispatch_channel(server_url: str, typ: t.Literal["async", "sync"] = "sync", elif typ == "sync" and ssl: return grpc.secure_channel(server_url, credentials=credentials, options=options, compression=compression) elif typ == "sync": return grpc.insecure_channel(server_url, options=options, compression=compression) else: raise ValueError(f"Unknown type: {typ}") - class GrpcClient(Client): ssl: bool ssl_client_credentials: t.Optional[ClientCredentials] options: t.Any compression: t.Optional[grpc.Compression] - def __init__( - self, - server_url: str, - svc: bentoml.Service, - # gRPC specific options - ssl: bool = False, - options: t.Any | None = None, - compression: grpc.Compression | None = None, - ssl_client_credentials: ClientCredentials | None = None, - **kwargs: t.Any) -> None: + def __init__(self, server_url: str, svc: bentoml.Service, # gRPC specific options + ssl: bool = False, options: t.Any | None = None, compression: grpc.Compression | None = None, ssl_client_credentials: ClientCredentials | None = None, **kwargs: t.Any) -> None: self.ssl, self.ssl_client_credentials, self.options, self.compression = ssl, ssl_client_credentials, options, compression super().__init__(server_url, svc, **kwargs) + @functools.cached_property def inner(self) -> grpc.Channel: if self.ssl: @@ -87,29 +77,33 @@ class GrpcClient(Client): logger.error("Caught RpcError while connecting to %s:%s:\n", host, port) logger.error(err) raise + @classmethod def from_url(cls, url: str, **kwargs: t.Any) -> GrpcClient: - with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel: metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest())) + with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), 
ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel: + metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest())) reflection = bentoml.Service(metadata.name) for api in metadata.apis: - try: reflection.apis[api.name] = InferenceAPI[t.Any](None, - bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}), - bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}), - name=api.name, doc=api.docs) - except Exception as e: logger.error("Failed to instantiate client for API %s: ", api.name, e) + try: + reflection.apis[api.name] = InferenceAPI[t.Any](None, bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}), bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}), name=api.name, doc=api.docs) + except Exception as e: + logger.error("Failed to instantiate client for API %s: ", api.name, e) return cls(url, reflection, **kwargs) - def health(self) -> t.Any: return services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service="")) + + def health(self) -> t.Any: + return services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service="")) + def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: channel_kwargs = {k: kwargs.pop(f"_grpc_channel_{k}", None) for k in {"timeout", "metadata", "credentials", "wait_for_ready", "compression"}} if _inference_api.multi_input: if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") fake_resp = ensure_exec_coro(_inference_api.input.to_proto(kwargs)) - else: fake_resp = ensure_exec_coro(_inference_api.input.to_proto(data)) + else: + fake_resp = ensure_exec_coro(_inference_api.input.to_proto(data)) api_fn = {v: k for k, v in self.svc.apis.items()} stubs = services.BentoServiceStub(self.inner) proto = stubs.Call(pb.Request(**{"api_name": api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs) return ensure_exec_coro(_inference_api.output.from_proto(getattr(proto, proto.WhichOneof("content")))) - class AsyncGrpcClient(AsyncClient): ssl: bool ssl_client_credentials: t.Optional[ClientCredentials] @@ -117,19 +111,11 @@ class AsyncGrpcClient(AsyncClient): interceptors: t.Optional[t.Sequence[aio.ClientInterceptor]] compression: t.Optional[grpc.Compression] - def __init__( - self, - server_url: str, - svc: bentoml.Service, - # gRPC specific options - ssl: bool = False, - options: aio.ChannelArgumentType | None = None, - interceptors: t.Sequence[aio.ClientInterceptor] | None = None, - compression: grpc.Compression | None = None, - ssl_client_credentials: ClientCredentials | None = None, - **kwargs: t.Any) -> None: + def __init__(self, server_url: str, svc: bentoml.Service, # gRPC specific options + ssl: bool = False, options: aio.ChannelArgumentType | None = None, interceptors: t.Sequence[aio.ClientInterceptor] | None = None, compression: grpc.Compression | None = None, ssl_client_credentials: ClientCredentials | None = None, **kwargs: t.Any) -> None: 
self.ssl, self.ssl_client_credentials, self.options, self.interceptors, self.compression = ssl, ssl_client_credentials, options, interceptors, compression super().__init__(server_url, svc, **kwargs) + @functools.cached_property def inner(self) -> aio.Channel: if self.ssl: @@ -160,18 +146,22 @@ class AsyncGrpcClient(AsyncClient): logger.error("Caught RpcError while connecting to %s:%s:\n", host, port) logger.error(err) raise + @classmethod async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncGrpcClient: - async with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None), interceptors=kwargs.get("interceptors", None)) as channel: metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest())) + async with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None), interceptors=kwargs.get("interceptors", None)) as channel: + metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest())) reflection = bentoml.Service(metadata.name) for api in metadata.apis: - try: reflection.apis[api.name] = InferenceAPI[t.Any](None, - bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}), - bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}), - name=api.name, doc=api.docs) - except Exception as e: logger.error("Failed to instantiate client for API %s: ", api.name, e) + try: + reflection.apis[api.name] = InferenceAPI[t.Any](None, bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}), bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}), name=api.name, doc=api.docs) + except Exception as e: + logger.error("Failed to instantiate client for API %s: ", api.name, e) return cls(url, reflection, **kwargs) - async def health(self) -> t.Any: return await services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service="")) + + async def health(self) -> t.Any: + return await services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service="")) + async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: channel_kwargs = {k: kwargs.pop(f"_grpc_channel_{k}", None) for k in {"timeout", "metadata", "credentials", "wait_for_ready", "compression"}} state = self.inner.get_state(try_to_connect=True) @@ -179,7 +169,8 @@ class AsyncGrpcClient(AsyncClient): if _inference_api.multi_input: if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") fake_resp = await 
_inference_api.input.to_proto(kwargs) - else: fake_resp = await _inference_api.input.to_proto(data) + else: + fake_resp = await _inference_api.input.to_proto(data) api_fn = {v: k for k, v in self.svc.apis.items()} async with self.inner: stubs = services.BentoServiceStub(self.inner) diff --git a/openllm-client/src/openllm_client/benmin/_http.py b/openllm-client/src/openllm_client/benmin/_http.py index 447b2f1b..4426b8af 100644 --- a/openllm-client/src/openllm_client/benmin/_http.py +++ b/openllm-client/src/openllm_client/benmin/_http.py @@ -4,14 +4,13 @@ from bentoml._internal.service.inference_api import InferenceAPI from urllib.parse import urlparse from openllm_client.benmin import Client, AsyncClient from openllm_core.utils import ensure_exec_coro - logger = logging.getLogger(__name__) - class HttpClient(Client): @functools.cached_property def inner(self) -> httpx.Client: if not urlparse(self.server_url).netloc: raise ValueError(f"Invalid server url: {self.server_url}") return httpx.Client(base_url=self.server_url) + @staticmethod def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: host = host if "://" in host else "http://" + host @@ -26,12 +25,16 @@ class HttpClient(Client): logger.debug("Server is not ready yet, retrying in %d seconds...", check_interval) time.sleep(check_interval) # Try once more and raise for exception - try: httpx.get(f"{host}:{port}/readyz").raise_for_status() + try: + httpx.get(f"{host}:{port}/readyz").raise_for_status() except httpx.HTTPStatusError as err: logger.error("Failed to wait until server ready: %s:%d", host, port) logger.error(err) raise - def health(self) -> httpx.Response: return self.inner.get("/readyz") + + def health(self) -> httpx.Response: + return self.inner.get("/readyz") + @classmethod def from_url(cls, url: str, **kwargs: t.Any) -> HttpClient: url = url if "://" in url else "http://" + url @@ -47,8 +50,10 @@ class HttpClient(Client): if "x-bentoml-io-descriptor" not in meth_spec["requestBody"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") - try: reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/")) - except Exception as e: logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e) + try: + reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/")) + except Exception as e: + logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e) return cls(url, reflection) def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: @@ -57,7 +62,8 @@ class HttpClient(Client): if _inference_api.multi_input: if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple 
inputs, and thus required to pass as keyword arguments.") fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(kwargs, None)) - else: fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(data, None)) + else: + fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(data, None)) # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this. if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None @@ -71,12 +77,12 @@ class HttpClient(Client): # Request.headers sets a _headers variable. We will need to set this value to our fake request object. fake_req._headers = headers return ensure_exec_coro(_inference_api.output.from_http_request(fake_req)) - class AsyncHttpClient(AsyncClient): @functools.cached_property def inner(self) -> httpx.AsyncClient: if not urlparse(self.server_url).netloc: raise ValueError(f"Invalid server url: {self.server_url}") return httpx.AsyncClient(base_url=self.server_url) + @staticmethod async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: host = host if "://" in host else "http://" + host @@ -95,14 +101,17 @@ class AsyncHttpClient(AsyncClient): async with httpx.AsyncClient(base_url=f"{host}:{port}") as sess: resp = await sess.get("/readyz") if resp.status_code != 200: raise TimeoutError(f"Timeout while waiting for server @ `{host}:{port}` to be ready: {resp.status_code}: {resp.content!s}") - async def health(self) -> httpx.Response: return await self.inner.get("/readyz") + + async def health(self) -> httpx.Response: + return await self.inner.get("/readyz") + @classmethod async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncHttpClient: url = url if "://" in url else "http://" + url async with httpx.AsyncClient(base_url=url) as session: - resp = await session.get("/docs.json") - if resp.status_code != 200: raise ValueError(f"Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await resp.aread()).decode()}") - _spec = orjson.loads(await resp.aread()) + resp = await session.get("/docs.json") + if resp.status_code != 200: raise ValueError(f"Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await resp.aread()).decode()}") + _spec = orjson.loads(await resp.aread()) reflection = bentoml.Service(_spec["info"]["title"]) @@ -112,16 +121,20 @@ class AsyncHttpClient(AsyncClient): if "x-bentoml-io-descriptor" not in meth_spec["requestBody"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}") - try: reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/")) - except ValueError as e: logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e) + try: + reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), 
bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/")) + except ValueError as e: + logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e) return cls(url, reflection) + async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: # All gRPC kwargs should be popped out. kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_grpc_")} if _inference_api.multi_input: if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") fake_resp = await _inference_api.input.to_http_response(kwargs, None) - else: fake_resp = await _inference_api.input.to_http_response(data, None) + else: + fake_resp = await _inference_api.input.to_http_response(data, None) # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this. if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None diff --git a/openllm-client/src/openllm_client/client.py b/openllm-client/src/openllm_client/client.py index 3b7e3400..72c43233 100644 --- a/openllm-client/src/openllm_client/client.py +++ b/openllm-client/src/openllm_client/client.py @@ -2,31 +2,25 @@ from __future__ import annotations import logging from urllib.parse import urlparse from ._base import BaseClient, BaseAsyncClient - logger = logging.getLogger(__name__) - def process_http_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None: address = address if "://" in address else "http://" + address parsed = urlparse(address) self._host, *_port = parsed.netloc.split(":") if len(_port) == 0: self._port = "80" if parsed.scheme == "http" else "443" else: self._port = next(iter(_port)) - class HTTPClient(BaseClient): def __init__(self, address: str, timeout: int = 30): process_http_address(self, address) super().__init__(address, timeout) - class AsyncHTTPClient(BaseAsyncClient): def __init__(self, address: str, timeout: int = 30): process_http_address(self, address) super().__init__(address, timeout) - class GrpcClient(BaseClient): def __init__(self, address: str, timeout: int = 30): self._host, self._port = address.split(":") super().__init__(address, timeout) - class AsyncGrpcClient(BaseAsyncClient): def __init__(self, address: str, timeout: int = 30): self._host, self._port = address.split(":") diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py index 3553a3fe..867ad829 100644 --- a/openllm-core/src/openllm_core/_configuration.py +++ b/openllm-core/src/openllm_core/_configuration.py @@ -8,24 +8,24 @@ For example, the following config class: ```python class FlanT5Config(openllm.LLMConfig): - __config__ = { - "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", - "default_id": "google/flan-t5-large", - "model_ids": [ - "google/flan-t5-small", - "google/flan-t5-base", - "google/flan-t5-large", - "google/flan-t5-xl", - "google/flan-t5-xxl", - ], - } + __config__ = { + "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", + "default_id": "google/flan-t5-large", + "model_ids": [ + "google/flan-t5-small", + "google/flan-t5-base", + "google/flan-t5-large", + "google/flan-t5-xl", + "google/flan-t5-xxl", + ], + } - class GenerationConfig: - temperature: float = 0.9 - max_new_tokens: int = 2048 - top_k: int = 50 - top_p: float = 0.4 - repetition_penalty = 
1.0 + class GenerationConfig: + temperature: float = 0.9 + max_new_tokens: int = 2048 + top_k: int = 50 + top_p: float = 0.4 + repetition_penalty = 1.0 ``` which generates the environment OPENLLM_FLAN_T5_GENERATION_TEMPERATURE for users to configure temperature @@ -40,18 +40,7 @@ from deepmerge.merger import Merger from ._strategies import LiteralResourceSpec, available_resource_spec, resource_spec from ._typing_compat import LiteralString, NotRequired, Required, overload, AdapterType, LiteralRuntime from .exceptions import ForbiddenAttributeError -from .utils import ( - ENV_VARS_TRUE_VALUES, - MYPY, - ReprMixin, - bentoml_cattr, - codegen, - dantic, - field_env_key, - first_not_none, - lenient_issubclass, - LazyLoader -) +from .utils import ENV_VARS_TRUE_VALUES, MYPY, ReprMixin, bentoml_cattr, codegen, dantic, field_env_key, first_not_none, lenient_issubclass, LazyLoader from .utils.import_utils import BACKENDS_MAPPING # NOTE: Using internal API from attr here, since we are actually allowing subclass of openllm_core.LLMConfig to become 'attrs'-ish from attr._compat import set_closure_cell @@ -71,13 +60,11 @@ __all__ = ["LLMConfig", "GenerationConfig", "SamplingParams", "field_env_key"] logger = logging.getLogger(__name__) config_merger = Merger([(dict, "merge")], ["override"], ["override"]) - # case insensitive, but rename to conform with type class _PeftEnumMeta(enum.EnumMeta): def __getitem__(self, __key: str | t.Any, /) -> t.Any: if isinstance(__key, str): __key = inflection.underscore(__key).upper() return self._member_map_[__key] - # vendorred from peft.utils.config.PeftType since we don't have hard dependency on peft # see https://github.com/huggingface/peft/blob/main/src/peft/utils/config.py class PeftType(str, enum.Enum, metaclass=_PeftEnumMeta): @@ -97,21 +84,23 @@ class PeftType(str, enum.Enum, metaclass=_PeftEnumMeta): return None @classmethod - def supported(cls) -> set[str]: return {inflection.underscore(v.value) for v in cls} - def to_str(self) -> str: return self.value - @staticmethod - def get(__key: str | t.Any, /) -> PeftType: return PeftType[__key] # type-safe getitem. + def supported(cls) -> set[str]: + return {inflection.underscore(v.value) for v in cls} + def to_str(self) -> str: + return self.value + + @staticmethod + def get(__key: str | t.Any, /) -> PeftType: + return PeftType[__key] # type-safe getitem. _PEFT_TASK_TYPE_TARGET_MAPPING = {"causal_lm": "CAUSAL_LM", "seq2seq_lm": "SEQ_2_SEQ_LM"} _object_setattr = object.__setattr__ - def _adapter_converter(value: AdapterType | str | PeftType | None) -> PeftType: if value is None: raise ValueError("'AdapterType' cannot be None.") if isinstance(value, PeftType): return value if value not in PeftType.supported(): raise ValueError(f"Given '{value}' is not a supported adapter type.") return PeftType.get(value) - @attr.define(slots=True, init=True) class FineTuneConfig: """FineTuneConfig defines a default value for fine-tuning this any given LLM. @@ -141,11 +130,16 @@ class FineTuneConfig: if t.TYPE_CHECKING and not MYPY: # The following type stubs makes __init__ aware of attrs internal type converter. @overload - def __init__(self, adapter_type: AdapterType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None: ... + def __init__(self, adapter_type: AdapterType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None: + ... 
+ @overload - def __init__(self, adapter_type: PeftType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None: ... + def __init__(self, adapter_type: PeftType = ..., adapter_config: dict[str, t.Any] = ..., inference_mode: bool = ..., llm_config_class: type[LLMConfig] = ...) -> None: + ... + # The below should be generated via attrs. Only here to conform with pyright strict checking. - def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: ... + def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: + ... adapter_type: PeftType = dantic.Field("lora", description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'", use_default_converter=False, converter=_adapter_converter) adapter_config: t.Dict[str, t.Any] = dantic.Field(None, description="The configuration for the adapter. The content of the dict depends on the adapter type.", validator=attr.validators.optional(attr.validators.instance_of(dict)), converter=attr.converters.default_if_none(factory=dict), use_default_converter=False) @@ -172,7 +166,6 @@ class FineTuneConfig: adapter_type, inference_mode = attrs.pop("adapter_type", self.adapter_type), attrs.get("inference_mode", self.inference_mode) if "llm_config_class" in attrs: raise ForbiddenAttributeError("'llm_config_class' should not be passed when using 'with_config'.") return attr.evolve(self, adapter_type=adapter_type, inference_mode=inference_mode, adapter_config=config_merger.merge(self.adapter_config, attrs)) - @attr.frozen(slots=True, repr=False, init=False) class GenerationConfig(ReprMixin): """GenerationConfig is the attrs-compatible version of ``transformers.GenerationConfig``, with some additional validation and environment constructor. @@ -195,7 +188,10 @@ class GenerationConfig(ReprMixin): top_p: float = dantic.Field(1.0, description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.") typical_p: float = dantic.Field(1.0, description="Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to `typical_p` or higher are kept for generation. See [this paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.") epsilon_cutoff: float = dantic.Field(0.0, description="If set to float strictly between 0 and 1, only tokens with a conditional probability greater than `epsilon_cutoff` will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details.") - eta_cutoff: float = dantic.Field(0.0, description="""Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. 
See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. """) + eta_cutoff: float = dantic.Field( + 0.0, + description="""Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. """ + ) diversity_penalty: float = dantic.Field(0.0, description="This value is subtracted from a beam's score if it generates a token same as any beam from other group at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. ") repetition_penalty: float = dantic.Field(1.0, description="The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.") encoder_repetition_penalty: float = dantic.Field(1.0, description="The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty.") @@ -224,18 +220,21 @@ class GenerationConfig(ReprMixin): if t.TYPE_CHECKING and not MYPY: # stubs this for pyright as mypy already has a attr plugin builtin - def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: ... + def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: + ... + def __init__(self, *, _internal: bool = False, **attrs: t.Any): if not _internal: raise RuntimeError("GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config") self.__attrs_init__(**attrs) + def __getitem__(self, item: str) -> t.Any: if hasattr(self, item): return getattr(self, item) raise KeyError(f"'{self.__class__.__name__}' has no attribute {item}.") + @property - def __repr_keys__(self) -> set[str]: return {i.name for i in attr.fields(self.__class__)} - + def __repr_keys__(self) -> set[str]: + return {i.name for i in attr.fields(self.__class__)} bentoml_cattr.register_unstructure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)})) - @attr.frozen(slots=True, repr=False, init=False) class SamplingParams(ReprMixin): """SamplingParams is the attr-compatible version of ``vllm.SamplingParams``. It provides some utilities to also respect shared variables from ``openllm.LLMConfig``. @@ -261,7 +260,8 @@ class SamplingParams(ReprMixin): top_k: int top_p: float - def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: ... + def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: + ... 
def __init__(self, *, _internal: bool = False, **attrs: t.Any): if not _internal: raise RuntimeError("SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config or create one with 'SamplingParams.from_generation_config'") @@ -274,10 +274,14 @@ class SamplingParams(ReprMixin): def __getitem__(self, item: str) -> t.Any: if hasattr(self, item): return getattr(self, item) raise KeyError(f"'{self.__class__.__name__}' has no attribute {item}.") - @property - def __repr_keys__(self) -> set[str]: return {i.name for i in attr.fields(self.__class__)} - def to_vllm(self) -> vllm.SamplingParams: return vllm.SamplingParams(max_tokens=self.max_tokens, temperature=self.temperature, top_k=self.top_k, top_p=self.top_p, **bentoml_cattr.unstructure(self)) + @property + def __repr_keys__(self) -> set[str]: + return {i.name for i in attr.fields(self.__class__)} + + def to_vllm(self) -> vllm.SamplingParams: + return vllm.SamplingParams(max_tokens=self.max_tokens, temperature=self.temperature, top_k=self.top_k, top_p=self.top_p, **bentoml_cattr.unstructure(self)) + @classmethod def from_generation_config(cls, generation_config: GenerationConfig, **attrs: t.Any) -> Self: """The main entrypoint for creating a SamplingParams from ``openllm.LLMConfig``.""" @@ -290,13 +294,11 @@ class SamplingParams(ReprMixin): top_p = first_not_none(attrs.pop("top_p", None), default=generation_config["top_p"]) max_tokens = first_not_none(attrs.pop("max_tokens", None), attrs.pop("max_new_tokens", None), default=generation_config["max_new_tokens"]) return cls(_internal=True, temperature=temperature, top_k=top_k, top_p=top_p, max_tokens=max_tokens, **attrs) - bentoml_cattr.register_unstructure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)})) bentoml_cattr.register_structure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename="max_tokens"))) # cached it here to save one lookup per assignment _object_getattribute = object.__getattribute__ - class ModelSettings(t.TypedDict, total=False): """ModelSettings serve only for typing purposes as this is transcribed into LLMConfig.__config__. 
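A quick aside on the `SamplingParams` hunk above: `from_generation_config` remains the single entry point for turning an `LLMConfig`'s generation defaults into vLLM parameters, and the cattrs structure hook registered right after the class still accepts `max_new_tokens` on the wire via `override(rename="max_tokens")`. A rough usage sketch, not part of this patch (the `"flan-t5"` model name and the `openllm_core._configuration` import path are assumptions):

```python
import openllm_core
from openllm_core._configuration import SamplingParams  # SamplingParams is listed in this module's __all__

# AutoConfig.for_model is the same lookup the client uses in _base.py's `config` property.
config = openllm_core.AutoConfig.for_model("flan-t5")

params = SamplingParams.from_generation_config(config.generation_config)
# max_tokens falls back to generation_config["max_new_tokens"]; temperature/top_k/top_p are copied over.

params = SamplingParams.from_generation_config(config.generation_config, max_new_tokens=256, temperature=0.2)
# Explicit overrides win: first_not_none() resolves max_tokens to 256 here.

params.to_vllm()  # maps onto vllm.SamplingParams (requires vllm to be installed)
```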
@@ -341,9 +343,7 @@ class ModelSettings(t.TypedDict, total=False): # tokenizer_class is the custom tokenizer class for this given LLM tokenizer_class: t.Optional[str] - _transformed_type: DictStrAny = {"fine_tune_strategies": t.Dict[AdapterType, FineTuneConfig], "default_implementation": t.Dict[LiteralResourceSpec, LiteralRuntime]} - @attr.define(frozen=False, slots=True, field_transformer=lambda _, __: [attr.Attribute.from_counting_attr(k, dantic.Field(kw_only=False if t.get_origin(ann) is not Required else True, auto_default=True, use_default_converter=False, type=_transformed_type.get(k, ann), metadata={"target": f"__openllm_{k}__"}, description=f"ModelSettings field for {k}.")) for k, ann in t.get_type_hints(ModelSettings).items()]) class _ModelSettingsAttr: """Internal attrs representation of ModelSettings.""" @@ -353,7 +353,8 @@ class _ModelSettingsAttr: raise KeyError(key) @classmethod - def default(cls) -> _ModelSettingsAttr: return cls(**t.cast(DictStrAny, ModelSettings(default_id="__default__", model_ids=["__default__"], architecture="PreTrainedModel", default_implementation={"cpu": "pt", "nvidia.com/gpu": "pt"}, name_type="dasherize", requires_gpu=False, url="", model_type="causal_lm", trust_remote_code=False, requirements=None, tokenizer_class=None, timeout=int(36e6), service_name="", workers_per_resource=1., runtime="transformers"))) + def default(cls) -> _ModelSettingsAttr: + return cls(**t.cast(DictStrAny, ModelSettings(default_id="__default__", model_ids=["__default__"], architecture="PreTrainedModel", default_implementation={"cpu": "pt", "nvidia.com/gpu": "pt"}, name_type="dasherize", requires_gpu=False, url="", model_type="causal_lm", trust_remote_code=False, requirements=None, tokenizer_class=None, timeout=int(36e6), service_name="", workers_per_resource=1., runtime="transformers"))) # NOTE: The below are dynamically generated by the field_transformer if t.TYPE_CHECKING: @@ -379,7 +380,6 @@ class _ModelSettingsAttr: fine_tune_strategies: t.Dict[AdapterType, FineTuneConfig] tokenizer_class: t.Optional[str] # update-config-stubs.py: attrs stop - # a heuristic cascading implementation resolver based on available resources def get_default_implementation(default_implementation_mapping: dict[LiteralResourceSpec, LiteralRuntime]) -> LiteralRuntime: available_spec = available_resource_spec() @@ -387,7 +387,6 @@ def get_default_implementation(default_implementation_mapping: dict[LiteralResou elif resource_spec("amd") in available_spec: return default_implementation_mapping.get(resource_spec("amd"), "pt") elif resource_spec("nvidia") in available_spec: return default_implementation_mapping.get(resource_spec("nvidia"), "pt") else: return default_implementation_mapping.get(resource_spec("cpu"), "pt") - def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr: if "generation_class" in cl_.__config__: raise ValueError(f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.") @@ -432,11 +431,9 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ _converted[_adapter_type] = FineTuneConfig(PeftType[_adapter_type], _possible_ft_config, False, _llm_config_class) _final_value_dct["fine_tune_strategies"] = _converted return attr.evolve(_settings_attr, **_final_value_dct) - bentoml_cattr.register_structure_hook(_ModelSettingsAttr, structure_settings) - -def _setattr_class(attr_name: str, value_var: t.Any) -> str: return f"setattr(cls, 
'{attr_name}', {value_var})" - +def _setattr_class(attr_name: str, value_var: t.Any) -> str: + return f"setattr(cls, '{attr_name}', {value_var})" def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance, _prefix: LiteralString = "openllm") -> t.Callable[..., None]: """Generate the assignment script with prefix attributes __openllm___.""" args: ListStr = [] @@ -451,13 +448,13 @@ def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance annotations[attr_name] = field.type return codegen.generate_function(cls, "__assign_attr", lines, args=("cls", *args), globs=globs, annotations=annotations) - _reserved_namespace = {"__config__", "GenerationConfig", "SamplingParams"} - @attr.define(slots=True) class _ConfigAttr: @staticmethod - def Field(default: t.Any = None, **attrs: t.Any) -> t.Any: return dantic.Field(default, **attrs) + def Field(default: t.Any = None, **attrs: t.Any) -> t.Any: + return dantic.Field(default, **attrs) + """Field is a alias to the internal dantic utilities to easily create attrs.fields with pydantic-compatible interface. For example: @@ -604,7 +601,6 @@ class _ConfigAttr: __openllm_tokenizer_class__: t.Optional[str] = Field(None) """Optional tokenizer class for this given LLM. See Llama for example.""" # update-config-stubs.py: special stop - class _ConfigBuilder: """A modified version of attrs internal _ClassBuilder, and should only be called within __init_subclass__ of LLMConfig. @@ -673,7 +669,8 @@ class _ConfigBuilder: # and since we use the _ConfigBuilder in __init_subclass__, it will # raise recusion error. See https://peps.python.org/pep-0487/ for more # information on how __init_subclass__ works. - for k, value in cd.items(): setattr(self._cls, k, value) + for k, value in cd.items(): + setattr(self._cls, k, value) return self.make_closure(self._cls) def make_closure(self, cls: type[t.Any]) -> type[t.Any]: @@ -684,18 +681,23 @@ class _ConfigBuilder: # as `method.__closure__`. Since we replace the class with a # clone, we rewrite these references so it keeps working. for item in cls.__dict__.values(): - # Class- and staticmethods hide their functions inside. - # These might need to be rewritten as well. - if isinstance(item, (classmethod, staticmethod)): closure_cells = getattr(item.__func__, "__closure__", None) - # Workaround for property `super()` shortcut (PY3-only). - # There is no universal way for other descriptors. - elif isinstance(item, property): closure_cells = getattr(item.fget, "__closure__", None) - else: closure_cells = getattr(item, "__closure__", None) + if isinstance(item, (classmethod, staticmethod)): + # Class- and staticmethods hide their functions inside. + # These might need to be rewritten as well. + closure_cells = getattr(item.__func__, "__closure__", None) + elif isinstance(item, property): + # Workaround for property `super()` shortcut (PY3-only). + # There is no universal way for other descriptors. + closure_cells = getattr(item.fget, "__closure__", None) + else: + closure_cells = getattr(item, "__closure__", None) if not closure_cells: continue # Catch None or the empty list. 
for cell in closure_cells: - try: match = cell.cell_contents is self._cls - except ValueError: pass # ValueError: Cell is empty + try: + match = cell.cell_contents is self._cls + except ValueError: + pass # ValueError: Cell is empty else: if match: set_closure_cell(cell, cls) return cls @@ -703,12 +705,12 @@ class _ConfigBuilder: def add_attrs_init(self) -> Self: self._cls_dict["__attrs_init__"] = codegen.add_method_dunders(self._cls, _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True)) return self + def add_repr(self) -> Self: for key, fn in ReprMixin.__dict__.items(): if key in ("__repr__", "__str__", "__repr_name__", "__repr_str__", "__repr_args__"): self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn) self._cls_dict["__repr_keys__"] = property(lambda _: {i.name for i in self._attrs} | {"generation_config", "sampling_config"}) return self - @attr.define(slots=True, init=False) class LLMConfig(_ConfigAttr): """``openllm.LLMConfig`` is a pydantic-like ``attrs`` interface that offers fast and easy-to-use APIs. @@ -816,14 +818,18 @@ class LLMConfig(_ConfigAttr): # auto assignment attributes generated from __config__ after create the new slot class. _make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettingsAttr))(cls) + def _make_subclass(class_attr: str, base: type[At], globs: dict[str, t.Any] | None = None, suffix_env: LiteralString | None = None) -> type[At]: camel_name = cls.__name__.replace("Config", "") klass = attr.make_class(f"{camel_name}{class_attr}", [], bases=(base,), slots=True, weakref_slot=True, frozen=True, repr=False, init=False, collect_by_mro=True, field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__, suffix=suffix_env, globs=globs, default_callback=lambda field_name, field_default: getattr(getattr(cls, class_attr), field_name, field_default) if codegen.has_own_attribute(cls, class_attr) else field_default)) # For pickling to work, the __module__ variable needs to be set to the # frame where the class is created. This respect the module that is created from cls - try: klass.__module__ = cls.__module__ - except (AttributeError, ValueError): pass + try: + klass.__module__ = cls.__module__ + except (AttributeError, ValueError): + pass return t.cast("type[At]", klass) + cls.__openllm_generation_class__ = _make_subclass("GenerationConfig", GenerationConfig, suffix_env="generation") cls.__openllm_sampling_class__ = _make_subclass("SamplingParams", SamplingParams, suffix_env="sampling") @@ -864,8 +870,10 @@ class LLMConfig(_ConfigAttr): cls.__openllm_hints__ = {f.name: f.type for ite in [attr.fields(cls), attr.fields(cls.__openllm_generation_class__), attr.fields(cls.__openllm_sampling_class__),] for f in ite} # for pickling to work, need to set the module to the correct outer frame - try: cls.__module__ = sys._getframe(1).f_globals.get("__name__", "__main__") - except (AttributeError, ValueError): pass + try: + cls.__module__ = sys._getframe(1).f_globals.get("__name__", "__main__") + except (AttributeError, ValueError): + pass def __setattr__(self, attr: str, value: t.Any) -> None: if attr in _reserved_namespace: raise ForbiddenAttributeError(f"{attr} should not be set during runtime as these value will be reflected during runtime. 
Instead, you can create a custom LLM subclass {self.__class__.__name__}.") @@ -889,6 +897,7 @@ class LLMConfig(_ConfigAttr): # The rest of attrs should only be the attributes to be passed to __attrs_init__ self.__attrs_init__(**attrs) + # fmt: off # update-config-stubs.py: start # NOTE: ModelSettings arguments @overload @@ -1052,6 +1061,7 @@ class LLMConfig(_ConfigAttr): @overload def __getitem__(self, item: t.Literal["ia3"]) -> dict[str, t.Any]: ... # update-config-stubs.py: stop + # fmt: on def __getitem__(self, item: LiteralString | t.Any) -> t.Any: """Allowing access LLMConfig as a dictionary. The order will always evaluate as. @@ -1075,11 +1085,22 @@ class LLMConfig(_ConfigAttr): def __getattribute__(self, item: str) -> t.Any: if item in _reserved_namespace: raise ForbiddenAttributeError(f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified.") return _object_getattribute.__get__(self)(item) - def __len__(self) -> int: return len(self.__openllm_accepted_keys__) + len(self.__openllm_extras__) - def keys(self) -> list[str]: return list(self.__openllm_accepted_keys__) + list(self.__openllm_extras__) - def values(self) -> list[t.Any]: return ([getattr(self, k.name) for k in attr.fields(self.__class__)] + [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] + [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.values())) - def items(self) -> list[tuple[str, t.Any]]: return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] + [(k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)] + [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.items())) - def __iter__(self) -> t.Iterator[str]: return iter(self.keys()) + + def __len__(self) -> int: + return len(self.__openllm_accepted_keys__) + len(self.__openllm_extras__) + + def keys(self) -> list[str]: + return list(self.__openllm_accepted_keys__) + list(self.__openllm_extras__) + + def values(self) -> list[t.Any]: + return ([getattr(self, k.name) for k in attr.fields(self.__class__)] + [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] + [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.values())) + + def items(self) -> list[tuple[str, t.Any]]: + return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] + [(k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)] + [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.items())) + + def __iter__(self) -> t.Iterator[str]: + return iter(self.keys()) + def __contains__(self, item: t.Any) -> bool: if item in self.__openllm_extras__: return True return item in self.__openllm_accepted_keys__ @@ -1114,9 +1135,12 @@ class LLMConfig(_ConfigAttr): # frame where the class is created. Bypass this step in environments where # sys._getframe is not defined (Jython for example) or sys._getframe is not # defined for arguments greater than 0 (IronPython). 
- try: new_cls.__module__ = sys._getframe(1).f_globals.get("__name__", "__main__") - except (AttributeError, ValueError): pass + try: + new_cls.__module__ = sys._getframe(1).f_globals.get("__name__", "__main__") + except (AttributeError, ValueError): + pass return new_cls(**attrs) + def model_dump(self, flatten: bool = False, **_: t.Any) -> DictStrAny: dumped = bentoml_cattr.unstructure(self) generation_config = bentoml_cattr.unstructure(self.generation_config) @@ -1125,13 +1149,18 @@ class LLMConfig(_ConfigAttr): else: dumped["generation_config"] = generation_config dumped.update(sampling_config) return dumped - def model_dump_json(self, **kwargs: t.Any) -> bytes: return orjson.dumps(self.model_dump(**kwargs)) + + def model_dump_json(self, **kwargs: t.Any) -> bytes: + return orjson.dumps(self.model_dump(**kwargs)) @classmethod def model_construct_json(cls, json_str: str | bytes) -> Self: - try: attrs = orjson.loads(json_str) - except orjson.JSONDecodeError as err: raise openllm_core.exceptions.ValidationError(f"Failed to load JSON: {err}") from None + try: + attrs = orjson.loads(json_str) + except orjson.JSONDecodeError as err: + raise openllm_core.exceptions.ValidationError(f"Failed to load JSON: {err}") from None return bentoml_cattr.structure(attrs, cls) + @classmethod def model_construct_env(cls, **attrs: t.Any) -> Self: """A helpers that respect configuration values environment variables.""" @@ -1141,8 +1170,10 @@ class LLMConfig(_ConfigAttr): config_from_env: DictStrAny = {} if env_json_string is not None: - try: config_from_env = orjson.loads(env_json_string) - except orjson.JSONDecodeError as e: raise RuntimeError(f"Failed to parse '{model_config}' as valid JSON string.") from e + try: + config_from_env = orjson.loads(env_json_string) + except orjson.JSONDecodeError as e: + raise RuntimeError(f"Failed to parse '{model_config}' as valid JSON string.") from e if "generation_config" in attrs: generation_config = attrs.pop("generation_config") @@ -1173,13 +1204,20 @@ class LLMConfig(_ConfigAttr): return self.model_construct_env(**llm_config_attrs), {k: v for k, v in attrs.items() if k not in key_to_remove} @overload - def to_generation_config(self, return_as_dict: t.Literal[False] = False) -> transformers.GenerationConfig: ... + def to_generation_config(self, return_as_dict: t.Literal[False] = False) -> transformers.GenerationConfig: + ... + @overload - def to_generation_config(self, return_as_dict: t.Literal[True] = ...) -> DictStrAny: ... + def to_generation_config(self, return_as_dict: t.Literal[True] = ...) -> DictStrAny: + ... + def to_generation_config(self, return_as_dict: bool = False) -> transformers.GenerationConfig | DictStrAny: config = transformers.GenerationConfig(**bentoml_cattr.unstructure(self.generation_config)) return config.to_dict() if return_as_dict else config - def to_sampling_config(self) -> vllm.SamplingParams: return self.sampling_config.to_vllm() + + def to_sampling_config(self) -> vllm.SamplingParams: + return self.sampling_config.to_vllm() + @classmethod def to_click_options(cls, f: AnyCallable) -> click.Command: """Convert current configuration to click options. 
@@ -1217,9 +1255,12 @@ class LLMConfig(_ConfigAttr): # holds a mapping from self.__openllm_model_type__ to peft.TaskType @classmethod - def peft_task_type(cls) -> str: return _PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__] + def peft_task_type(cls) -> str: + return _PEFT_TASK_TYPE_TARGET_MAPPING[cls.__openllm_model_type__] + @classmethod - def default_implementation(cls) -> LiteralRuntime: return first_not_none(cls.__openllm_env__["framework_value"], default=get_default_implementation(cls.__openllm_default_implementation__)) + def default_implementation(cls) -> LiteralRuntime: + return first_not_none(cls.__openllm_env__["framework_value"], default=get_default_implementation(cls.__openllm_default_implementation__)) def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]: """This handler will sanitize all attrs and setup prompt text. @@ -1245,10 +1286,7 @@ class LLMConfig(_ConfigAttr): `openllm.LLM` also has a postprocess_generate that will just call this method. """ return generation_result - - bentoml_cattr.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls, LLMConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True)) - def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: """Structure a dictionary to a LLMConfig object. @@ -1270,6 +1308,5 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: # The rest should be passed to extras data = {k: v for k, v in data.items() if k not in cls.__openllm_accepted_keys__} return cls(generation_config=generation_config, __openllm_extras__=data, **cls_attrs) - bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config) openllm_home = os.path.expanduser(os.environ.get("OPENLLM_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm"))) diff --git a/openllm-core/src/openllm_core/_prompt.py b/openllm-core/src/openllm_core/_prompt.py index 54c4494e..b8ee17f3 100644 --- a/openllm-core/src/openllm_core/_prompt.py +++ b/openllm-core/src/openllm_core/_prompt.py @@ -5,12 +5,13 @@ class PromptFormatter(string.Formatter): def vformat(self, format_string: str, args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> t.Any: if len(args) > 0: raise ValueError("Positional arguments are not supported") return super().vformat(format_string, args, kwargs) + def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> None: extras = set(kwargs).difference(used_args) if extras: raise KeyError(f"Extra params passed: {extras}") + def extract_template_variables(self, template: str) -> t.Sequence[str]: return [field[1] for field in self.parse(template) if field[1] is not None] - default_formatter = PromptFormatter() def process_prompt(prompt: str, template: str | None = None, use_prompt_template: bool = True, **attrs: t.Any) -> str: # Currently, all default prompt will always have `instruction` key. 
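# ---------------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the diff above) for the PromptFormatter/process_prompt
# helpers above. The template string is an assumption; any template exposing an {instruction}
# field behaves the same way, and we assume the unshown top of process_prompt simply falls
# through to the formatting shown in the next hunk when a template is supplied.
from openllm_core._prompt import default_formatter, process_prompt

template = "Answer the following question:\nQuestion: {instruction}\nAnswer:"
default_formatter.extract_template_variables(template)  # -> ['instruction']
process_prompt("What is BentoML?", template)            # fills {instruction} with the prompt
# The formatter itself is strict: default_formatter.format(template, instruction="hi", extra=1)
# raises KeyError("Extra params passed: {'extra'}"), and positional arguments raise ValueError.
# ---------------------------------------------------------------------------------------------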
@@ -19,5 +20,7 @@ def process_prompt(prompt: str, template: str | None = None, use_prompt_template template_variables = default_formatter.extract_template_variables(template) prompt_variables = {k: v for k, v in attrs.items() if k in template_variables} if "instruction" in prompt_variables: raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_prompt_template=True'") - try: return template.format(instruction=prompt, **prompt_variables) - except KeyError as e: raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_prompt_template=False' to disable the default prompt template.") from None + try: + return template.format(instruction=prompt, **prompt_variables) + except KeyError as e: + raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_prompt_template=False' to disable the default prompt template.") from None diff --git a/openllm-core/src/openllm_core/_schema.py b/openllm-core/src/openllm_core/_schema.py index d8accd08..893150b7 100644 --- a/openllm-core/src/openllm_core/_schema.py +++ b/openllm-core/src/openllm_core/_schema.py @@ -5,33 +5,43 @@ import attr, inflection from openllm_core._configuration import GenerationConfig, LLMConfig from .utils import bentoml_cattr if t.TYPE_CHECKING: import vllm - @attr.frozen(slots=True) class GenerationInput: prompt: str llm_config: LLMConfig adapter_name: str | None = attr.field(default=None) - def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True), "adapter_name": self.adapter_name} + + def model_dump(self) -> dict[str, t.Any]: + return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True), "adapter_name": self.adapter_name} + @staticmethod def convert_llm_config(data: dict[str, t.Any] | LLMConfig, cls: type[LLMConfig] | None = None) -> LLMConfig: if isinstance(data, LLMConfig): return data else: if cls is None: raise ValueError("'cls' must pass if given data is a dictionary.") return cls(**data) + @classmethod def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: import openllm return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs)) + @classmethod - def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)}) + def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]: + return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)}) @attr.frozen(slots=True) class GenerationOutput: responses: t.List[t.Any] configuration: t.Dict[str, t.Any] + @property - def marshaled_config(self) -> GenerationConfig: return bentoml_cattr.structure(self.configuration, GenerationConfig) + def marshaled_config(self) -> GenerationConfig: + return bentoml_cattr.structure(self.configuration, GenerationConfig) + @property - def 
unmarshaled(self) -> dict[str, t.Any]: return bentoml_cattr.unstructure(self) + def unmarshaled(self) -> dict[str, t.Any]: + return bentoml_cattr.unstructure(self) + def __getitem__(self, key: str) -> t.Any: if hasattr(self, key): return getattr(self, key) elif key in self.configuration: return self.configuration[key] @@ -49,7 +59,8 @@ class MetadataOutput: class EmbeddingsOutput: embeddings: t.List[t.List[float]] num_tokens: int -def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.Any]: return dict(request_id=request_output.request_id, prompt=request_output.prompt, finished=request_output.finished, prompt_token_ids=request_output.prompt_token_ids, outputs=[dict(index=it.index, text=it.text, token_ids=it.token_ids, cumulative_logprob=it.cumulative_logprob, logprobs=it.logprobs, finish_reason=it.finish_reason) for it in request_output.outputs]) +def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.Any]: + return dict(request_id=request_output.request_id, prompt=request_output.prompt, finished=request_output.finished, prompt_token_ids=request_output.prompt_token_ids, outputs=[dict(index=it.index, text=it.text, token_ids=it.token_ids, cumulative_logprob=it.cumulative_logprob, logprobs=it.logprobs, finish_reason=it.finish_reason) for it in request_output.outputs]) @attr.define class HfAgentInput: inputs: str diff --git a/openllm-core/src/openllm_core/_strategies.py b/openllm-core/src/openllm_core/_strategies.py index df2a44f1..78449934 100644 --- a/openllm-core/src/openllm_core/_strategies.py +++ b/openllm-core/src/openllm_core/_strategies.py @@ -5,12 +5,12 @@ from bentoml._internal.resource import get_resource, system_resources from bentoml._internal.runner.strategy import THREAD_ENVS from .utils import DEBUG, ReprMixin from ._typing_compat import overload - class DynResource(t.Protocol): resource_id: t.ClassVar[str] - @classmethod - def from_system(cls) -> t.Sequence[t.Any]: ... + @classmethod + def from_system(cls) -> t.Sequence[t.Any]: + ... logger = logging.getLogger(__name__) def _strtoul(s: str) -> int: """Return -1 or positive integer sequence string starts with,.""" @@ -21,7 +21,6 @@ def _strtoul(s: str) -> int: if idx + 1 == len(s): idx += 1 # noqa: PLW2901 # NOTE: idx will be set via enumerate return int(s[:idx]) if idx > 0 else -1 - def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]: rcs: list[str] = [] for elem in lst.split(","): @@ -31,16 +30,16 @@ def _parse_list_with_prefix(lst: str, prefix: str) -> list[str]: if not elem.startswith(prefix): break rcs.append(elem) return rcs - _STACK_LEVEL = 3 - - @overload # variant: default callback -def _parse_visible_devices() -> list[str] | None: ... +def _parse_visible_devices() -> list[str] | None: + ... @overload # variant: specify None, and respect_env -def _parse_visible_devices(default_var: None, *, respect_env: t.Literal[True]) -> list[str] | None: ... +def _parse_visible_devices(default_var: None, *, respect_env: t.Literal[True]) -> list[str] | None: + ... @overload # variant: default var is something other than None -def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]: ... +def _parse_visible_devices(default_var: str = ..., *, respect_env: t.Literal[False]) -> list[str]: + ... 
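# ---------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the diff above): the call patterns captured by the three
# @overload variants of _parse_visible_devices above. Concrete return values depend on the
# running system; the "0,1,3" spec and its expected result are assumptions.
from openllm_core._strategies import _parse_visible_devices

_parse_visible_devices()                            # respect CUDA_VISIBLE_DEVICES; may return None
_parse_visible_devices(None, respect_env=True)      # explicit form of the default variant
_parse_visible_devices("0,1,3", respect_env=False)  # parse the given spec, e.g. ["0", "1", "3"]
# _from_spec below uses the last form when resolving a user-provided GPU spec string.
# ---------------------------------------------------------------------------------------------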
def _parse_visible_devices(default_var: str | None = None, respect_env: bool = True) -> list[str] | None: """CUDA_VISIBLE_DEVICES aware with default var for parsing spec.""" if respect_env: @@ -64,7 +63,6 @@ def _parse_visible_devices(default_var: str | None = None, respect_env: bool = T if x < 0: break rc.append(x) return [str(i) for i in rc] - def _from_system(cls: type[DynResource]) -> list[str]: visible_devices = _parse_visible_devices() if visible_devices is None: @@ -100,13 +98,15 @@ def _from_system(cls: type[DynResource]) -> list[str]: except (ImportError, RuntimeError, AttributeError): return [] return visible_devices - @overload -def _from_spec(cls: type[DynResource], spec: int) -> list[str]: ... +def _from_spec(cls: type[DynResource], spec: int) -> list[str]: + ... @overload -def _from_spec(cls: type[DynResource], spec: list[int | str]) -> list[str]: ... +def _from_spec(cls: type[DynResource], spec: list[int | str]) -> list[str]: + ... @overload -def _from_spec(cls: type[DynResource], spec: str) -> list[str]: ... +def _from_spec(cls: type[DynResource], spec: str) -> list[str]: + ... def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]: if isinstance(spec, int): if spec in (-1, 0): return [] @@ -116,13 +116,15 @@ def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]: if not spec: return [] if spec.isdigit(): spec = ",".join([str(i) for i in range(_strtoul(spec))]) return _parse_visible_devices(spec, respect_env=False) - elif isinstance(spec, list): return [str(x) for x in spec] - else: raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.") - + elif isinstance(spec, list): + return [str(x) for x in spec] + else: + raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.") def _raw_device_uuid_nvml() -> list[str] | None: from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer - try: nvml_h = CDLL("libnvidia-ml.so.1") + try: + nvml_h = CDLL("libnvidia-ml.so.1") except Exception: warnings.warn("Failed to find nvidia binding", stacklevel=_STACK_LEVEL) return None @@ -152,7 +154,6 @@ def _raw_device_uuid_nvml() -> list[str] | None: uuids.append(buf.raw.decode("ascii").strip("\0")) del nvml_h return uuids - def _validate(cls: type[DynResource], val: list[t.Any]) -> None: if cls.resource_id == "amd.com/gpu": raise RuntimeError("AMD GPU validation is not yet supported. 
Make sure to call 'get_resource(..., validate=False)'") @@ -175,9 +176,8 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None: if err != cuda.CUresult.CUDA_SUCCESS: raise ValueError(f"Failed to get device {el}") except (ImportError, RuntimeError): pass - -def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: return types.new_class(name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind}, lambda ns: ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"})) - +def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: + return types.new_class(name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind}, lambda ns: ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"})) # NOTE: we need to hint these t.Literal since mypy is too dumb to infer this as literal :facepalm: _TPU_RESOURCE: t.Literal["cloud-tpus.google.com/v2"] = "cloud-tpus.google.com/v2" _AMD_GPU_RESOURCE: t.Literal["amd.com/gpu"] = "amd.com/gpu" @@ -194,7 +194,6 @@ AmdGpuResource = _make_resource_class("AmdGpuResource", _AMD_GPU_RESOURCE, """AMD GPU resource. ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""") LiteralResourceSpec = t.Literal["cloud-tpus.google.com/v2", "amd.com/gpu", "nvidia.com/gpu", "cpu"] - # convenient mapping def resource_spec(name: t.Literal["tpu", "amd", "nvidia", "cpu"]) -> LiteralResourceSpec: if name == "tpu": return _TPU_RESOURCE @@ -202,7 +201,6 @@ def resource_spec(name: t.Literal["tpu", "amd", "nvidia", "cpu"]) -> LiteralReso elif name == "nvidia": return _NVIDIA_GPU_RESOURCE elif name == "cpu": return _CPU_RESOURCE else: raise ValueError("Unknown alias. Accepted: ['tpu', 'amd', 'nvidia', 'cpu']") - @functools.lru_cache def available_resource_spec() -> tuple[LiteralResourceSpec, ...]: """This is a utility function that helps determine the available resources from the given running system. @@ -216,7 +214,6 @@ def available_resource_spec() -> tuple[LiteralResourceSpec, ...]: if len(NvidiaGpuResource.from_system()) > 0: available.append(_NVIDIA_GPU_RESOURCE) available.append(_CPU_RESOURCE) return tuple(available) - class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): """This extends the default BentoML strategy where we check for NVIDIA GPU resource -> AMD GPU resource -> CPU resource. 
@@ -327,5 +324,4 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): if idx >= len(gpus): raise ValueError(f"Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}") dev = str(gpus[idx]) return dev - -__all__=["CascadingResourceStrategy", "get_resource"] +__all__ = ["CascadingResourceStrategy", "get_resource"] diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index 4d0a7d18..c845202d 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -10,7 +10,6 @@ if t.TYPE_CHECKING: from bentoml._internal.runner.strategy import Strategy from .utils.lazy import VersionInfo - M = t.TypeVar("M", bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, vllm.AsyncLLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]") T = t.TypeVar("T", bound="t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]") @@ -28,39 +27,33 @@ AdapterType = t.Literal["lora", "adalora", "adaption_prompt", "prefix_tuning", " LiteralContainerRegistry = t.Literal["docker", "gh", "ecr"] LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest", "custom"] -if sys.version_info[:2] >= (3,11): +if sys.version_info[:2] >= (3, 11): from typing import LiteralString as LiteralString, Self as Self, overload as overload from typing import NotRequired as NotRequired, Required as Required, dataclass_transform as dataclass_transform else: from typing_extensions import LiteralString as LiteralString, Self as Self, overload as overload from typing_extensions import NotRequired as NotRequired, Required as Required, dataclass_transform as dataclass_transform -if sys.version_info[:2] >= (3,10): +if sys.version_info[:2] >= (3, 10): from typing import TypeAlias as TypeAlias, ParamSpec as ParamSpec, Concatenate as Concatenate else: from typing_extensions import TypeAlias as TypeAlias, ParamSpec as ParamSpec, Concatenate as Concatenate - class PeftAdapterOutput(t.TypedDict): success: bool result: t.Dict[str, peft.PeftConfig] error_msg: str - class LLMEmbeddings(t.TypedDict): embeddings: t.List[t.List[float]] num_tokens: int - class AdaptersTuple(TupleAny): adapter_id: str name: t.Optional[str] config: DictStrAny - AdaptersMapping = t.Dict[AdapterType, t.Tuple[AdaptersTuple, ...]] - class RefTuple(TupleAny): git_hash: str version: VersionInfo strategy: LiteralContainerVersionStrategy - class LLMRunnable(bentoml.Runnable, t.Generic[M, T]): SUPPORTED_RESOURCES = ("amd.com/gpu", "nvidia.com/gpu", "cpu") SUPPORTS_CPU_MULTI_THREADING = True @@ -70,7 +63,6 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]): generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]] generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]] generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]] - class LLMRunner(bentoml.Runner, t.Generic[M, T]): __doc__: str __module__: str @@ -86,17 +78,33 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]): generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]] generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]] generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]] - def __init__(self, 
runnable_class: type[LLMRunnable[M, T]], *, runnable_init_params: dict[str, t.Any] | None = ..., name: str | None = ..., scheduling_strategy: type[Strategy] = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, dict[str, int]] | None = ..., embedded: bool = False,) -> None: ... - def __call__(self, prompt: str, **attrs: t.Any) -> t.Any: ... + + def __init__(self, runnable_class: type[LLMRunnable[M, T]], *, runnable_init_params: dict[str, t.Any] | None = ..., name: str | None = ..., scheduling_strategy: type[Strategy] = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, dict[str, int]] | None = ..., embedded: bool = False,) -> None: + ... + + def __call__(self, prompt: str, **attrs: t.Any) -> t.Any: + ... + @abc.abstractmethod - def embed(self, prompt: str | list[str]) -> LLMEmbeddings: ... - def run(self, prompt: str, **attrs: t.Any) -> t.Any: ... - async def async_run(self, prompt: str, **attrs: t.Any) -> t.Any: ... + def embed(self, prompt: str | list[str]) -> LLMEmbeddings: + ... + + def run(self, prompt: str, **attrs: t.Any) -> t.Any: + ... + + async def async_run(self, prompt: str, **attrs: t.Any) -> t.Any: + ... + @abc.abstractmethod - def download_model(self) -> bentoml.Model: ... + def download_model(self) -> bentoml.Model: + ... + @property @abc.abstractmethod - def peft_adapters(self) -> PeftAdapterOutput: ... + def peft_adapters(self) -> PeftAdapterOutput: + ... + @property @abc.abstractmethod - def __repr_keys__(self) -> set[str]: ... + def __repr_keys__(self) -> set[str]: + ... diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index 7c1a9d6c..0c878b69 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -14,12 +14,12 @@ if t.TYPE_CHECKING: # NOTE: This is the entrypoint when adding new model config CONFIG_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLMConfig"), ("dolly_v2", "DollyV2Config"), ("falcon", "FalconConfig"), ("flan_t5", "FlanT5Config"), ("gpt_neox", "GPTNeoXConfig"), ("llama", "LlamaConfig"), ("mpt", "MPTConfig"), ("opt", "OPTConfig"), ("stablelm", "StableLMConfig"), ("starcoder", "StarCoderConfig"), ("baichuan", "BaichuanConfig")]) - class _LazyConfigMapping(OrderedDict, ReprMixin): def __init__(self, mapping: OrderedDict[LiteralString, LiteralString]): self._mapping = mapping self._extra_content: dict[str, t.Any] = {} self._modules: dict[str, types.ModuleType] = {} + def __getitem__(self, key: str) -> t.Any: if key in self._extra_content: return self._extra_content[key] if key not in self._mapping: @@ -30,30 +30,48 @@ class _LazyConfigMapping(OrderedDict, ReprMixin): if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value) # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the object at the top level. 
return getattr(importlib.import_module("openllm"), value) + @property - def __repr_keys__(self) -> set[str]: return set(self._mapping.keys()) - def __repr__(self) -> str: return ReprMixin.__repr__(self) - def __repr_args__(self) -> t.Generator[tuple[str, t.Any], t.Any, t.Any]: yield from self._mapping.items() - def keys(self) -> ConfigKeysView: return t.cast("ConfigKeysView", list(self._mapping.keys()) + list(self._extra_content.keys())) - def values(self) -> ConfigValuesView: return t.cast("ConfigValuesView", [self[k] for k in self._mapping.keys()] + list(self._extra_content.values())) - def items(self) -> ConfigItemsView: return t.cast("ConfigItemsView", [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items())) - def __iter__(self) -> t.Iterator[str]: return iter(list(self._mapping.keys()) + list(self._extra_content.keys())) - def __contains__(self, item: t.Any) -> bool: return item in self._mapping or item in self._extra_content + def __repr_keys__(self) -> set[str]: + return set(self._mapping.keys()) + + def __repr__(self) -> str: + return ReprMixin.__repr__(self) + + def __repr_args__(self) -> t.Generator[tuple[str, t.Any], t.Any, t.Any]: + yield from self._mapping.items() + + def keys(self) -> ConfigKeysView: + return t.cast("ConfigKeysView", list(self._mapping.keys()) + list(self._extra_content.keys())) + + def values(self) -> ConfigValuesView: + return t.cast("ConfigValuesView", [self[k] for k in self._mapping.keys()] + list(self._extra_content.values())) + + def items(self) -> ConfigItemsView: + return t.cast("ConfigItemsView", [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items())) + + def __iter__(self) -> t.Iterator[str]: + return iter(list(self._mapping.keys()) + list(self._extra_content.keys())) + + def __contains__(self, item: t.Any) -> bool: + return item in self._mapping or item in self._extra_content + def register(self, key: str, value: t.Any) -> None: if key in self._mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM config, pick another name.") self._extra_content[key] = value - CONFIG_MAPPING: dict[str, type[openllm_core.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES) # The below handle special alias when we call underscore to the name directly without processing camelcase first. CONFIG_NAME_ALIASES: dict[str, str] = {"chat_glm": "chatglm", "stable_lm": "stablelm", "star_coder": "starcoder", "gpt_neo_x": "gpt_neox",} - class AutoConfig: - def __init__(self, *_: t.Any, **__: t.Any): raise EnvironmentError("Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.") + def __init__(self, *_: t.Any, **__: t.Any): + raise EnvironmentError("Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.") + @classmethod def for_model(cls, model_name: str, **attrs: t.Any) -> openllm_core.LLMConfig: model_name = inflection.underscore(model_name) if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name].model_construct_env(**attrs) raise ValueError(f"Unrecognized configuration class for {model_name}. 
Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.") + @classmethod def infer_class_from_name(cls, name: str) -> type[openllm_core.LLMConfig]: model_name = inflection.underscore(name) diff --git a/openllm-core/src/openllm_core/config/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py index 844128d0..68a85482 100644 --- a/openllm-core/src/openllm_core/config/configuration_baichuan.py +++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py @@ -1,7 +1,6 @@ from __future__ import annotations import openllm_core, typing as t from openllm_core._prompt import process_prompt - START_BAICHUAN_COMMAND_DOCSTRING = """\ Run a LLMServer for Baichuan model. @@ -22,8 +21,6 @@ or provide `--model-id` flag when running ``openllm start baichuan``: $ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" - - class BaichuanConfig(openllm_core.LLMConfig): """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology. @@ -34,12 +31,15 @@ class BaichuanConfig(openllm_core.LLMConfig): and English benchmarks (C-Eval, MMLU, etc). Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information. """ - __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/baichuan-inc/Baichuan-7B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "BaiChuanForCausalLM", - "default_id": "baichuan-inc/baichuan-7b", "model_ids": ["baichuan-inc/baichuan-7b", "baichuan-inc/baichuan-13b-base", "baichuan-inc/baichuan-13b-chat", "fireballoon/baichuan-vicuna-chinese-7b", "fireballoon/baichuan-vicuna-7b", "hiyouga/baichuan-7b-sft"]} + __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/baichuan-inc/Baichuan-7B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "BaiChuanForCausalLM", "default_id": "baichuan-inc/baichuan-7b", "model_ids": ["baichuan-inc/baichuan-7b", "baichuan-inc/baichuan-13b-base", "baichuan-inc/baichuan-13b-chat", "fireballoon/baichuan-vicuna-chinese-7b", "fireballoon/baichuan-vicuna-7b", "hiyouga/baichuan-7b-sft"]} + class GenerationConfig: max_new_tokens: int = 2048 top_p: float = 0.7 temperature: float = 0.95 + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] + + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: + return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_chatglm.py b/openllm-core/src/openllm_core/config/configuration_chatglm.py index 19030a8a..d70d9aba 100644 --- a/openllm-core/src/openllm_core/config/configuration_chatglm.py +++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py @@ -1,7 +1,6 @@ from __future__ import annotations import openllm_core, typing as t from 
openllm_core.utils import dantic - START_CHATGLM_COMMAND_DOCSTRING = """\ Run a LLMServer for ChatGLM model. @@ -22,7 +21,6 @@ or provide `--model-id` flag when running ``openllm start chatglm``: $ openllm start chatglm --model-id='thudm/chatglm-6b-int8' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" - class ChatGLMConfig(openllm_core.LLMConfig): """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework. @@ -37,10 +35,10 @@ class ChatGLMConfig(openllm_core.LLMConfig): Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information. """ - __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration", - "default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]} + __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration", "default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]} retain_history: bool = dantic.Field(False, description="Whether to retain history given to the model. If set to True, then the model will retain given history.") use_half_precision: bool = dantic.Field(True, description="Whether to use half precision for model.") + class GenerationConfig: max_new_tokens: int = 2048 num_beams: int = 1 @@ -50,11 +48,14 @@ class ChatGLMConfig(openllm_core.LLMConfig): def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[tuple[str, str]] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: prompt_text = "" if use_default_prompt_template and chat_history is not None: - for i, (old_query, response) in enumerate(chat_history): prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" + for i, (old_query, response) in enumerate(chat_history): + prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n" prompt_text += f"[Round {len(chat_history)}]\n问:{prompt}\n答:" - else: prompt_text = prompt + else: + prompt_text = prompt postprocess_generate_kwargs = {"chat_history": chat_history if chat_history is not None else None} return prompt_text, {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}, postprocess_generate_kwargs + def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str: generated, history = generation_result if self.config.retain_history: diff --git a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py index ce047cd0..d7a47b99 100644 --- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py +++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py @@ -54,7 +54,6 @@ def get_special_token_id(tokenizer: 
transformers.PreTrainedTokenizer, key: str) token_ids = tokenizer.encode(key) if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}") return token_ids[0] - class DollyV2Config(openllm_core.LLMConfig): """Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. @@ -67,15 +66,18 @@ class DollyV2Config(openllm_core.LLMConfig): Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information. """ - __config__ = {"timeout": 3600000, "url": "https://github.com/databrickslabs/dolly", "architecture": "GPTNeoXForCausalLM", - "default_id": "databricks/dolly-v2-3b", "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]} + __config__ = {"timeout": 3600000, "url": "https://github.com/databrickslabs/dolly", "architecture": "GPTNeoXForCausalLM", "default_id": "databricks/dolly-v2-3b", "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]} return_full_text: bool = dantic.Field(False, description="Whether to return the full prompt to the users.") + class GenerationConfig: temperature: float = 0.9 top_p: float = 0.92 top_k: int = 5 max_new_tokens: int = 256 eos_token_id: int = 50277 # NOTE: from get_special_token_id(self.tokenizer, END_KEY) + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {} - def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: return generation_result[0]["generated_text"] + + def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: + return generation_result[0]["generated_text"] diff --git a/openllm-core/src/openllm_core/config/configuration_falcon.py b/openllm-core/src/openllm_core/config/configuration_falcon.py index e645f679..e7e1869c 100644 --- a/openllm-core/src/openllm_core/config/configuration_falcon.py +++ b/openllm-core/src/openllm_core/config/configuration_falcon.py @@ -1,7 +1,6 @@ from __future__ import annotations import openllm_core, typing as t from openllm_core._prompt import process_prompt - START_FALCON_COMMAND_DOCSTRING = """\ Run a LLMServer for FalconLM model. @@ -27,7 +26,6 @@ DEFAULT_PROMPT_TEMPLATE = """{context} {user_name}: {instruction} {agent}: """ - class FalconConfig(openllm_core.LLMConfig): """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora. @@ -35,9 +33,21 @@ class FalconConfig(openllm_core.LLMConfig): Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information. 
""" - __config__ = {"name_type": "lowercase", "trust_remote_code": True, "requires_gpu": True, "timeout": int(36e6), "url": "https://falconllm.tii.ae/", "requirements": ["einops", "xformers"], "architecture": "FalconForCausalLM", - "default_id": "tiiuae/falcon-7b", "model_ids": ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"], - "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none", "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]},)} + __config__ = { + "name_type": "lowercase", + "trust_remote_code": True, + "requires_gpu": True, + "timeout": int(36e6), + "url": "https://falconllm.tii.ae/", + "requirements": ["einops", "xformers"], + "architecture": "FalconForCausalLM", + "default_id": "tiiuae/falcon-7b", + "model_ids": ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"], + "fine_tune_strategies": ({ + "adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none", "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"] + },) + } + class GenerationConfig: max_new_tokens: int = 200 top_k: int = 10 @@ -47,4 +57,6 @@ class FalconConfig(openllm_core.LLMConfig): def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] + + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: + return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py index 7fffddb7..dbfd8fdc 100644 --- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py +++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py @@ -1,7 +1,6 @@ from __future__ import annotations import openllm_core, typing as t from openllm_core._prompt import process_prompt - START_FLAN_T5_COMMAND_DOCSTRING = """\ Run a LLMServer for FLAN-T5 model. @@ -28,7 +27,6 @@ or provide `--model-id` flag when running ``openllm start flan-t5``: $ openllm start flan-t5 --model-id google/flan-t5-xxl """ DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:""" - class FlanT5Config(openllm_core.LLMConfig): """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf). @@ -36,8 +34,8 @@ class FlanT5Config(openllm_core.LLMConfig): Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information. 
""" - __config__ = {"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", "architecture": "T5ForConditionalGeneration", "model_type": "seq2seq_lm", - "default_id": "google/flan-t5-large", "model_ids": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl",]} + __config__ = {"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", "architecture": "T5ForConditionalGeneration", "model_type": "seq2seq_lm", "default_id": "google/flan-t5-large", "model_ids": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl",]} + class GenerationConfig: temperature: float = 0.9 max_new_tokens: int = 2048 @@ -47,4 +45,6 @@ class FlanT5Config(openllm_core.LLMConfig): def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] + + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: + return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py index 891002c5..537ecbc8 100644 --- a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py +++ b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py @@ -2,7 +2,6 @@ from __future__ import annotations import openllm_core, typing as t from openllm_core._prompt import process_prompt from openllm_core.utils import dantic - START_GPT_NEOX_COMMAND_DOCSTRING = """\ Run a LLMServer for GPTNeoX model. @@ -23,7 +22,6 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``: $ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" - class GPTNeoXConfig(openllm_core.LLMConfig): """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license. @@ -39,13 +37,15 @@ class GPTNeoXConfig(openllm_core.LLMConfig): Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox) for more information. 
""" - __config__ = {"model_name": "gpt_neox", "start_name": "gpt-neox", "requires_gpu": True, "architecture": "GPTNeoXForCausalLM", "url": "https://github.com/EleutherAI/gpt-neox", - "default_id": "eleutherai/gpt-neox-20b", "model_ids": ["eleutherai/gpt-neox-20b"]} + __config__ = {"model_name": "gpt_neox", "start_name": "gpt-neox", "requires_gpu": True, "architecture": "GPTNeoXForCausalLM", "url": "https://github.com/EleutherAI/gpt-neox", "default_id": "eleutherai/gpt-neox-20b", "model_ids": ["eleutherai/gpt-neox-20b"]} use_half_precision: bool = dantic.Field(True, description="Whether to use half precision for model.") class GenerationConfig: temperature: float = 0.9 max_new_tokens: int = 100 + def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature}, {} - def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] + + def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: + return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py index bb6b9b29..b0ebc164 100644 --- a/openllm-core/src/openllm_core/config/configuration_llama.py +++ b/openllm-core/src/openllm_core/config/configuration_llama.py @@ -2,7 +2,6 @@ from __future__ import annotations import typing as t, openllm_core from openllm_core._prompt import process_prompt from openllm_core.utils import dantic - START_LLAMA_COMMAND_DOCSTRING = """\ Run a LLMServer for Llama model. @@ -41,9 +40,9 @@ SINST_KEY, EINST_KEY, SYS_KEY, EOS_TOKEN, BOS_TOKEN = "[INST]", "[/INST]", "< str: return PROMPT_MAPPING[model_type] +def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str: + return PROMPT_MAPPING[model_type] DEFAULT_PROMPT_TEMPLATE = _get_prompt - class LlamaConfig(openllm_core.LLMConfig): """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. @@ -57,19 +56,54 @@ class LlamaConfig(openllm_core.LLMConfig): for more information. """ use_llama2_prompt: bool = dantic.Field(False, description="Whether to use the prompt format for Llama 2. 
Disable this when working with Llama 1.") - __config__ = {"name_type": "lowercase", "url": "https://github.com/facebookresearch/llama", "default_implementation": {"cpu": "pt", "nvidia.com/gpu": "pt"}, "architecture": "LlamaForCausalLM", "requirements": ["fairscale", "sentencepiece"], "tokenizer_class": "LlamaTokenizerFast", - "default_id": "NousResearch/llama-2-7b-hf", "model_ids": ["meta-llama/Llama-2-70b-chat-hf", "meta-llama/Llama-2-13b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-70b-hf", "meta-llama/Llama-2-13b-hf", - "meta-llama/Llama-2-7b-hf", "NousResearch/llama-2-70b-chat-hf", "NousResearch/llama-2-13b-chat-hf", "NousResearch/llama-2-7b-chat-hf", "NousResearch/llama-2-70b-hf", "NousResearch/llama-2-13b-hf", "NousResearch/llama-2-7b-hf", - "openlm-research/open_llama_7b_v2", "openlm-research/open_llama_3b_v2", "openlm-research/open_llama_13b", "huggyllama/llama-65b", "huggyllama/llama-30b", "huggyllama/llama-13b", "huggyllama/llama-7b"], - "fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"},)} + __config__ = { + "name_type": "lowercase", + "url": "https://github.com/facebookresearch/llama", + "default_implementation": { + "cpu": "pt", "nvidia.com/gpu": "pt" + }, + "architecture": "LlamaForCausalLM", + "requirements": ["fairscale", "sentencepiece"], + "tokenizer_class": "LlamaTokenizerFast", + "default_id": "NousResearch/llama-2-7b-hf", + "model_ids": [ + "meta-llama/Llama-2-70b-chat-hf", + "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-70b-hf", + "meta-llama/Llama-2-13b-hf", + "meta-llama/Llama-2-7b-hf", + "NousResearch/llama-2-70b-chat-hf", + "NousResearch/llama-2-13b-chat-hf", + "NousResearch/llama-2-7b-chat-hf", + "NousResearch/llama-2-70b-hf", + "NousResearch/llama-2-13b-hf", + "NousResearch/llama-2-7b-hf", + "openlm-research/open_llama_7b_v2", + "openlm-research/open_llama_3b_v2", + "openlm-research/open_llama_13b", + "huggyllama/llama-65b", + "huggyllama/llama-30b", + "huggyllama/llama-13b", + "huggyllama/llama-7b" + ], + "fine_tune_strategies": ({ + "adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none" + },) + } + class GenerationConfig: max_new_tokens: int = 128 temperature: float = 0.6 top_p: float = 0.9 top_k: int = 12 + class SamplingParams: best_of: int = 1 presence_penalty: float = 0.5 + def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {} - def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] + + def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: + return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_mpt.py b/openllm-core/src/openllm_core/config/configuration_mpt.py index dde1dbb9..3473729d 100644 --- a/openllm-core/src/openllm_core/config/configuration_mpt.py +++ b/openllm-core/src/openllm_core/config/configuration_mpt.py @@ -2,7 +2,6 @@ from 
__future__ import annotations import typing as t, openllm_core from openllm_core.utils import dantic from openllm_core._prompt import process_prompt - MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"] START_MPT_COMMAND_DOCSTRING = """\ @@ -42,9 +41,9 @@ _chat_prompt, _default_prompt, _instruct_prompt = """{instruction}""", """{instr {response_key} """.format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction="{instruction}", response_key=RESPONSE_KEY) PROMPT_MAPPING = {"default": _default_prompt, "instruct": _instruct_prompt, "storywriter": _default_prompt, "chat": _chat_prompt} -def _get_prompt(model_type: str) -> str: return PROMPT_MAPPING[model_type] +def _get_prompt(model_type: str) -> str: + return PROMPT_MAPPING[model_type] DEFAULT_PROMPT_TEMPLATE = _get_prompt - class MPTConfig(openllm_core.LLMConfig): """MPT is a decoder-style transformer pretrained from scratch on English text and code. @@ -54,14 +53,15 @@ class MPTConfig(openllm_core.LLMConfig): on HuggingFace. Refers [HuggingFace's MosaicML page](https://huggingface.co/mosaicml) for more details on specific models. """ - __config__ = {"name_type": "lowercase", "trust_remote_code": True, "url": "https://huggingface.co/mosaicml", "timeout": int(36e6), "requirements": ["triton", "einops"], "architecture": "MPTForCausalLM", - "default_id": "mosaicml/mpt-7b-instruct", "model_ids": ["mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter", "mosaicml/mpt-30b", "mosaicml/mpt-30b-instruct", "mosaicml/mpt-30b-chat"]} + __config__ = {"name_type": "lowercase", "trust_remote_code": True, "url": "https://huggingface.co/mosaicml", "timeout": int(36e6), "requirements": ["triton", "einops"], "architecture": "MPTForCausalLM", "default_id": "mosaicml/mpt-7b-instruct", "model_ids": ["mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter", "mosaicml/mpt-30b", "mosaicml/mpt-30b-instruct", "mosaicml/mpt-30b-chat"]} prompt_type: MPTPromptType = dantic.Field('"default"', description="Given prompt type for running MPT. Default will be inferred from model name if pretrained.") max_sequence_length: int = dantic.Field(2048, description="Max sequence length to run MPT with. 
Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)") + class GenerationConfig: max_new_tokens: int = 128 temperature: float = 0 top_p: float = 0.8 + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: _template = None if use_default_prompt_template: @@ -72,4 +72,6 @@ class MPTConfig(openllm_core.LLMConfig): else: prompt_type = "default" _template = DEFAULT_PROMPT_TEMPLATE(prompt_type) return process_prompt(prompt, _template, use_default_prompt_template), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: return generation_result[0] + + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: + return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py index c4fdd028..1731c944 100644 --- a/openllm-core/src/openllm_core/config/configuration_opt.py +++ b/openllm-core/src/openllm_core/config/configuration_opt.py @@ -2,7 +2,6 @@ from __future__ import annotations import openllm_core, typing as t from openllm_core.utils import dantic from openllm_core._prompt import process_prompt - START_OPT_COMMAND_DOCSTRING = """\ Run a LLMServer for OPT model. @@ -29,7 +28,6 @@ or provide `--model-id` flag when running ``openllm start opt``: $ openllm start opt --model-id facebook/opt-6.7b """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" - class OPTConfig(openllm_core.LLMConfig): """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI. @@ -40,18 +38,18 @@ class OPTConfig(openllm_core.LLMConfig): Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information. """ - __config__ = { - "name_type": "lowercase", "trust_remote_code": False, "url": "https://huggingface.co/docs/transformers/model_doc/opt", - "default_id": "facebook/opt-1.3b", "architecture": "OPTForCausalLM", "model_ids": ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-66b"], - "fine_tune_strategies": ({"adapter_type": "lora", "r": 16, "lora_alpha": 32, "target_modules": ["q_proj", "v_proj"], "lora_dropout": 0.05, "bias": "none"},) - } + __config__ = {"name_type": "lowercase", "trust_remote_code": False, "url": "https://huggingface.co/docs/transformers/model_doc/opt", "default_id": "facebook/opt-1.3b", "architecture": "OPTForCausalLM", "model_ids": ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-66b"], "fine_tune_strategies": ({"adapter_type": "lora", "r": 16, "lora_alpha": 32, "target_modules": ["q_proj", "v_proj"], "lora_dropout": 0.05, "bias": "none"},)} format_outputs: bool = dantic.Field(False, description="""Whether to format the outputs. 
This can be used when num_return_sequences > 1.""") + class GenerationConfig: top_k: int = 15 temperature: float = 0.75 max_new_tokens: int = 1024 num_return_sequences: int = 1 - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {} + + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {} + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: if len(generation_result) == 1: return generation_result[0] if self.config.format_outputs: return "Generated result:\n" + "\n -".join(generation_result) diff --git a/openllm-core/src/openllm_core/config/configuration_stablelm.py b/openllm-core/src/openllm_core/config/configuration_stablelm.py index 9498f2dc..13478a4d 100644 --- a/openllm-core/src/openllm_core/config/configuration_stablelm.py +++ b/openllm-core/src/openllm_core/config/configuration_stablelm.py @@ -1,7 +1,6 @@ from __future__ import annotations import openllm_core, typing as t from openllm_core._prompt import process_prompt - START_STABLELM_COMMAND_DOCSTRING = """\ Run a LLMServer for StableLM model. @@ -28,7 +27,6 @@ SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version) - StableLM will refuse to participate in anything that could harm a human. """ DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>""" - class StableLMConfig(openllm_core.LLMConfig): """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models. @@ -43,17 +41,21 @@ class StableLMConfig(openllm_core.LLMConfig): and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b) for more information. 
""" - __config__ = {"name_type": "lowercase", "url": "https://github.com/Stability-AI/StableLM", "architecture": "GPTNeoXForCausalLM", - "default_id": "stabilityai/stablelm-tuned-alpha-3b", "model_ids": ["stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", "stabilityai/stablelm-base-alpha-7b"]} + __config__ = {"name_type": "lowercase", "url": "https://github.com/Stability-AI/StableLM", "architecture": "GPTNeoXForCausalLM", "default_id": "stabilityai/stablelm-tuned-alpha-3b", "model_ids": ["stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", "stabilityai/stablelm-base-alpha-7b"]} + class GenerationConfig: temperature: float = 0.9 max_new_tokens: int = 128 top_k: int = 0 top_p: float = 0.9 + def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: if "tuned" in self._model_id and use_default_prompt_template: system_prompt = attrs.pop("system_prompt", SYSTEM_PROMPT) prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs) - else: prompt_text = prompt + else: + prompt_text = prompt return prompt_text, {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p}, {} - def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] + + def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: + return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_starcoder.py b/openllm-core/src/openllm_core/config/configuration_starcoder.py index a0bc5e68..86c19580 100644 --- a/openllm-core/src/openllm_core/config/configuration_starcoder.py +++ b/openllm-core/src/openllm_core/config/configuration_starcoder.py @@ -1,6 +1,5 @@ from __future__ import annotations import openllm_core, typing as t - START_STARCODER_COMMAND_DOCSTRING = """\ Run a LLMServer for StarCoder model. @@ -22,7 +21,6 @@ $ openllm start starcoder --model-id 'bigcode/starcoder' """ DEFAULT_PROMPT_TEMPLATE = """{instruction}""" FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = "", "", "", "", "<|endoftext|>", "" - class StarCoderConfig(openllm_core.LLMConfig): """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded. @@ -32,8 +30,8 @@ class StarCoderConfig(openllm_core.LLMConfig): Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information. 
""" - __config__ = {"name_type": "lowercase", "requires_gpu": True, "url": "https://github.com/bigcode-project/starcoder", "architecture": "GPTBigCodeForCausalLM", "requirements": ["bitsandbytes"], "workers_per_resource": 0.5, - "default_id": "bigcode/starcoder", "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"]} + __config__ = {"name_type": "lowercase", "requires_gpu": True, "url": "https://github.com/bigcode-project/starcoder", "architecture": "GPTBigCodeForCausalLM", "requirements": ["bitsandbytes"], "workers_per_resource": 0.5, "default_id": "bigcode/starcoder", "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"]} + class GenerationConfig: temperature: float = 0.2 max_new_tokens: int = 256 @@ -42,13 +40,19 @@ class StarCoderConfig(openllm_core.LLMConfig): top_p: float = 0.95 pad_token_id: int = 49152 repetition_penalty: float = 1.2 + def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None if fim_mode: - try: prefix, suffix = prompt.split(FIM_INDICATOR) - except Exception as err: raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err + try: + prefix, suffix = prompt.split(FIM_INDICATOR) + except Exception as err: + raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}" - else: prompt_text = prompt + else: + prompt_text = prompt # XXX: This value for pad_token_id is currently a hack, need more investigate why the default starcoder doesn't include the same value as santacoder EOD return prompt_text, {"temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, "repetition_penalty": repetition_penalty, "pad_token_id": 49152, **attrs}, {} - def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] + + def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: + return generation_result[0] diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index 46995fd0..b7904a25 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -6,64 +6,42 @@ from __future__ import annotations import contextlib, functools, hashlib, logging, logging.config, os, sys, types, typing as t, openllm_core, asyncio from pathlib import Path from circus.exc import ConflictError -from bentoml._internal.configuration import ( - DEBUG_ENV_VAR as DEBUG_ENV_VAR, - GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR, - QUIET_ENV_VAR as QUIET_ENV_VAR, - get_debug_mode as _get_debug_mode, - get_quiet_mode as _get_quiet_mode, - set_quiet_mode as set_quiet_mode, -) +from bentoml._internal.configuration import (DEBUG_ENV_VAR as DEBUG_ENV_VAR, GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR, QUIET_ENV_VAR as QUIET_ENV_VAR, get_debug_mode as _get_debug_mode, get_quiet_mode as _get_quiet_mode, set_quiet_mode as set_quiet_mode,) from bentoml._internal.models.model import ModelContext as _ModelContext from bentoml._internal.types import LazyType as LazyType -from bentoml._internal.utils import ( - LazyLoader as LazyLoader, - bentoml_cattr as bentoml_cattr, - calc_dir_size as calc_dir_size, - first_not_none as first_not_none, - pkg as pkg, - 
reserve_free_port as reserve_free_port, - resolve_user_filepath as resolve_user_filepath, -) -from openllm_core.utils.lazy import ( - LazyModule as LazyModule, - VersionInfo as VersionInfo, -) +from bentoml._internal.utils import (LazyLoader as LazyLoader, bentoml_cattr as bentoml_cattr, calc_dir_size as calc_dir_size, first_not_none as first_not_none, pkg as pkg, reserve_free_port as reserve_free_port, resolve_user_filepath as resolve_user_filepath,) +from openllm_core.utils.lazy import (LazyModule as LazyModule, VersionInfo as VersionInfo,) if t.TYPE_CHECKING: from openllm_core._typing_compat import AnyCallable - logger = logging.getLogger(__name__) -try: from typing import GenericAlias as _TypingGenericAlias # type: ignore -except ImportError: _TypingGenericAlias = () # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on) +try: + from typing import GenericAlias as _TypingGenericAlias # type: ignore +except ImportError: + _TypingGenericAlias = () # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on) if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,) else: _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore # _GenericAlias is the actual GenericAlias implementation DEV_DEBUG_VAR = "OPENLLMDEVDEBUG" - def set_debug_mode(enabled: bool, level: int = 1) -> None: # monkeypatch bentoml._internal.configuration.set_debug_mode to remove unused logs if enabled: os.environ[DEV_DEBUG_VAR] = str(level) os.environ[DEBUG_ENV_VAR] = str(enabled) os.environ[_GRPC_DEBUG_ENV_VAR] = "DEBUG" if enabled else "ERROR" - def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.Any], ...] | None) -> bool: try: return isinstance(cls, type) and issubclass(cls, class_or_tuple) # type: ignore[arg-type] except TypeError: if isinstance(cls, _WithArgsTypes): return False raise - def ensure_exec_coro(coro: t.Coroutine[t.Any, t.Any, t.Any]) -> t.Any: - loop = asyncio.get_event_loop() - if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result() - else: return loop.run_until_complete(coro) - + loop = asyncio.get_event_loop() + if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result() + else: return loop.run_until_complete(coro) def available_devices() -> tuple[str, ...]: """Return available GPU under system. Currently only supports NVIDIA GPUs.""" from openllm_core._strategies import NvidiaGpuResource return tuple(NvidiaGpuResource.from_system()) - @functools.lru_cache(maxsize=128) def generate_hash_from_file(f: str, algorithm: t.Literal["md5", "sha1"] = "sha1") -> str: """Generate a hash from given file's modification time. @@ -76,29 +54,26 @@ def generate_hash_from_file(f: str, algorithm: t.Literal["md5", "sha1"] = "sha1" The generated hash. 
""" return getattr(hashlib, algorithm)(str(os.path.getmtime(resolve_filepath(f))).encode()).hexdigest() - @functools.lru_cache(maxsize=1) -def device_count() -> int: return len(available_devices()) - +def device_count() -> int: + return len(available_devices()) # equivocal setattr to save one lookup per assignment _object_setattr = object.__setattr__ - def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None: """This makes sure that we don't overwrite any existing attributes on the object.""" _setattr = functools.partial(setattr, obj) if isinstance(obj, type) else _object_setattr.__get__(obj) if not hasattr(obj, name): _setattr(name, value) - -def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str: return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key]))) - +def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str: + return "_".join(filter(None, map(str.upper, ["OPENLLM", model_name, suffix.strip("_") if suffix else "", key]))) # Special debug flag controled via OPENLLMDEVDEBUG DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR))) # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins MYPY = False SHOW_CODEGEN: bool = DEBUG and int(os.environ.get("OPENLLMDEVDEBUG", str(0))) > 3 - -def get_debug_mode() -> bool: return DEBUG or _get_debug_mode() -def get_quiet_mode() -> bool: return not DEBUG and _get_quiet_mode() - +def get_debug_mode() -> bool: + return DEBUG or _get_debug_mode() +def get_quiet_mode() -> bool: + return not DEBUG and _get_quiet_mode() class ExceptionFilter(logging.Filter): def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any): """A filter of all exception.""" @@ -114,18 +89,37 @@ class ExceptionFilter(logging.Filter): for exc in self.EXCLUDE_EXCEPTIONS: if issubclass(etype, exc): return False return True - class InfoFilter(logging.Filter): - def filter(self, record: logging.LogRecord) -> bool: return logging.INFO <= record.levelno < logging.WARNING - + def filter(self, record: logging.LogRecord) -> bool: + return logging.INFO <= record.levelno < logging.WARNING _LOGGING_CONFIG: dict[str, t.Any] = { - "version": 1, "disable_existing_loggers": True, - "filters": {"excfilter": {"()": "openllm_core.utils.ExceptionFilter"}, "infofilter": {"()": "openllm_core.utils.InfoFilter"}}, - "handlers": {"bentomlhandler": {"class": "logging.StreamHandler", "filters": ["excfilter", "infofilter"], "stream": "ext://sys.stdout"}, "defaulthandler": {"class": "logging.StreamHandler", "level": logging.WARNING}}, - "loggers": {"bentoml": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False}, "openllm": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False}}, - "root": {"level": logging.WARNING}, + "version": 1, + "disable_existing_loggers": True, + "filters": { + "excfilter": { + "()": "openllm_core.utils.ExceptionFilter" + }, "infofilter": { + "()": "openllm_core.utils.InfoFilter" + } + }, + "handlers": { + "bentomlhandler": { + "class": "logging.StreamHandler", "filters": ["excfilter", "infofilter"], "stream": "ext://sys.stdout" + }, "defaulthandler": { + "class": "logging.StreamHandler", "level": logging.WARNING + } + }, + "loggers": { + "bentoml": { + "handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False + }, "openllm": { + "handlers": ["bentomlhandler", 
"defaulthandler"], "level": logging.INFO, "propagate": False + } + }, + "root": { + "level": logging.WARNING + }, } - def configure_logging() -> None: """Configure logging for OpenLLM. @@ -145,7 +139,6 @@ def configure_logging() -> None: _LOGGING_CONFIG["root"]["level"] = logging.INFO logging.config.dictConfig(_LOGGING_CONFIG) - @functools.lru_cache(maxsize=1) def in_notebook() -> bool: try: @@ -153,10 +146,9 @@ def in_notebook() -> bool: if t.TYPE_CHECKING: from IPython.core.interactiveshell import InteractiveShell return "IPKernelApp" in t.cast("dict[str, t.Any]", t.cast(t.Callable[[], "InteractiveShell"], get_ipython)().config) - except (ImportError, AttributeError): return False - + except (ImportError, AttributeError): + return False _dockerenv, _cgroup = Path("/.dockerenv"), Path("/proc/self/cgroup") - class suppress(contextlib.suppress, contextlib.ContextDecorator): """A version of contextlib.suppress with decorator support. @@ -165,7 +157,6 @@ class suppress(contextlib.suppress, contextlib.ContextDecorator): ... {}[''] >>> key_error() """ - def compose(*funcs: AnyCallable) -> AnyCallable: """Compose any number of unary functions into a single unary function. @@ -182,9 +173,10 @@ def compose(*funcs: AnyCallable) -> AnyCallable: >>> [f(3*x, x+1) for x in range(1,10)] [1.5, 2.0, 2.25, 2.4, 2.5, 2.571, 2.625, 2.667, 2.7] """ - def compose_two(f1: AnyCallable, f2: AnyCallable) -> AnyCallable: return lambda *args, **kwargs: f1(f2(*args, **kwargs)) - return functools.reduce(compose_two, funcs) + def compose_two(f1: AnyCallable, f2: AnyCallable) -> AnyCallable: + return lambda *args, **kwargs: f1(f2(*args, **kwargs)) + return functools.reduce(compose_two, funcs) def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]: """Decorate a function with a transform function that is invoked on results returned from the decorated function. @@ -202,12 +194,10 @@ def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]: ``` """ return lambda func: functools.wraps(func)(compose(transform, func)) - @apply(bool) @suppress(FileNotFoundError) def _text_in_file(text: str, filename: Path) -> bool: return any(text in line for line in filename.open()) - def in_docker() -> bool: """Is this current environment running in docker? 
@@ -216,16 +206,15 @@ def in_docker() -> bool: ``` """ return _dockerenv.exists() or _text_in_file("docker", _cgroup) - T, K = t.TypeVar("T"), t.TypeVar("K") - def resolve_filepath(path: str, ctx: str | None = None) -> str: """Resolve a file path to an absolute path, expand user and environment variables.""" - try: return resolve_user_filepath(path, ctx) - except FileNotFoundError: return path - -def validate_is_path(maybe_path: str) -> bool: return os.path.exists(os.path.dirname(resolve_filepath(maybe_path))) - + try: + return resolve_user_filepath(path, ctx) + except FileNotFoundError: + return path +def validate_is_path(maybe_path: str) -> bool: + return os.path.exists(os.path.dirname(resolve_filepath(maybe_path))) def generate_context(framework_name: str) -> _ModelContext: framework_versions = {"transformers": pkg.get_pkg_version("transformers")} if openllm_core.utils.is_torch_available(): framework_versions["torch"] = pkg.get_pkg_version("torch") @@ -234,16 +223,13 @@ def generate_context(framework_name: str) -> _ModelContext: framework_versions["tensorflow"] = get_tf_version() if openllm_core.utils.is_flax_available(): framework_versions.update({"flax": pkg.get_pkg_version("flax"), "jax": pkg.get_pkg_version("jax"), "jaxlib": pkg.get_pkg_version("jaxlib")}) return _ModelContext(framework_name=framework_name, framework_versions=framework_versions) - _TOKENIZER_PREFIX = "_tokenizer_" - def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any]]: """Normalize the given attrs to a model and tokenizer kwargs accordingly.""" tokenizer_attrs = {k[len(_TOKENIZER_PREFIX):]: v for k, v in attrs.items() if k.startswith(_TOKENIZER_PREFIX)} for k in tuple(attrs.keys()): if k.startswith(_TOKENIZER_PREFIX): del attrs[k] return attrs, tokenizer_attrs - # NOTE: The set marks contains a set of modules name # that are available above and are whitelisted # to be included in the extra_objects map. 
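For reference, a small usage sketch of `normalize_attrs_to_model_tokenizer_pair` as defined above; the keyword arguments are made up purely for illustration:

from openllm_core.utils import normalize_attrs_to_model_tokenizer_pair

model_kwds, tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(device_map="auto", _tokenizer_padding_side="left")
# keys carrying the "_tokenizer_" prefix are stripped of it and routed to the tokenizer side
assert model_kwds == {"device_map": "auto"}
assert tokenizer_kwds == {"padding_side": "left"}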
@@ -254,50 +240,75 @@ _whitelist_modules = {"pkg"} _extras: dict[str, t.Any] = {k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith("_"))} _extras["__openllm_migration__"] = {"ModelEnv": "EnvVarMixin"} _import_structure: dict[str, list[str]] = { - "analytics": [], "codegen": [], "dantic": [], "representation": ["ReprMixin"], "lazy": ["LazyModule"], - "import_utils": ["OPTIONAL_DEPENDENCIES", "ENV_VARS_TRUE_VALUES", "DummyMetaclass", "EnvVarMixin", "require_backends", - "is_cpm_kernels_available", "is_einops_available", "is_flax_available", "is_tf_available", "is_vllm_available", "is_torch_available", "is_bitsandbytes_available", "is_peft_available", "is_datasets_available", - "is_transformers_supports_kbit", "is_transformers_supports_agent", "is_jupyter_available", "is_jupytext_available", "is_notebook_available", "is_triton_available", "is_autogptq_available", "is_sentencepiece_available", - "is_xformers_available", "is_fairscale_available", "is_grpc_available", "is_grpc_health_available", "is_transformers_available"]} + "analytics": [], + "codegen": [], + "dantic": [], + "representation": ["ReprMixin"], + "lazy": ["LazyModule"], + "import_utils": [ + "OPTIONAL_DEPENDENCIES", + "ENV_VARS_TRUE_VALUES", + "DummyMetaclass", + "EnvVarMixin", + "require_backends", + "is_cpm_kernels_available", + "is_einops_available", + "is_flax_available", + "is_tf_available", + "is_vllm_available", + "is_torch_available", + "is_bitsandbytes_available", + "is_peft_available", + "is_datasets_available", + "is_transformers_supports_kbit", + "is_transformers_supports_agent", + "is_jupyter_available", + "is_jupytext_available", + "is_notebook_available", + "is_triton_available", + "is_autogptq_available", + "is_sentencepiece_available", + "is_xformers_available", + "is_fairscale_available", + "is_grpc_available", + "is_grpc_health_available", + "is_transformers_available" + ] +} if t.TYPE_CHECKING: # NOTE: The following exports useful utils from bentoml - from . import ( - analytics as analytics, - codegen as codegen, - dantic as dantic, - ) + from . 
import (analytics as analytics, codegen as codegen, dantic as dantic,) from .import_utils import ( - ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES, - OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES, - DummyMetaclass as DummyMetaclass, - EnvVarMixin as EnvVarMixin, - is_autogptq_available as is_autogptq_available, - is_bitsandbytes_available as is_bitsandbytes_available, - is_cpm_kernels_available as is_cpm_kernels_available, - is_datasets_available as is_datasets_available, - is_einops_available as is_einops_available, - is_fairscale_available as is_fairscale_available, - is_flax_available as is_flax_available, - is_jupyter_available as is_jupyter_available, - is_jupytext_available as is_jupytext_available, - is_notebook_available as is_notebook_available, - is_peft_available as is_peft_available, - is_sentencepiece_available as is_sentencepiece_available, - is_tf_available as is_tf_available, - is_torch_available as is_torch_available, - is_transformers_supports_agent as is_transformers_supports_agent, - is_transformers_supports_kbit as is_transformers_supports_kbit, - is_triton_available as is_triton_available, - is_vllm_available as is_vllm_available, - is_xformers_available as is_xformers_available, - is_grpc_available as is_grpc_available, - is_grpc_health_available as is_grpc_health_available, - is_transformers_available as is_transformers_available, - require_backends as require_backends, + ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES, + OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES, + DummyMetaclass as DummyMetaclass, + EnvVarMixin as EnvVarMixin, + is_autogptq_available as is_autogptq_available, + is_bitsandbytes_available as is_bitsandbytes_available, + is_cpm_kernels_available as is_cpm_kernels_available, + is_datasets_available as is_datasets_available, + is_einops_available as is_einops_available, + is_fairscale_available as is_fairscale_available, + is_flax_available as is_flax_available, + is_jupyter_available as is_jupyter_available, + is_jupytext_available as is_jupytext_available, + is_notebook_available as is_notebook_available, + is_peft_available as is_peft_available, + is_sentencepiece_available as is_sentencepiece_available, + is_tf_available as is_tf_available, + is_torch_available as is_torch_available, + is_transformers_supports_agent as is_transformers_supports_agent, + is_transformers_supports_kbit as is_transformers_supports_kbit, + is_triton_available as is_triton_available, + is_vllm_available as is_vllm_available, + is_xformers_available as is_xformers_available, + is_grpc_available as is_grpc_available, + is_grpc_health_available as is_grpc_health_available, + is_transformers_available as is_transformers_available, + require_backends as require_backends, ) from .representation import ReprMixin as ReprMixin - __lazy = LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects=_extras) __all__ = __lazy.__all__ __dir__ = __lazy.__dir__ diff --git a/openllm-core/src/openllm_core/utils/analytics.py b/openllm-core/src/openllm_core/utils/analytics.py index 3d861a4c..2b7d4d41 100644 --- a/openllm-core/src/openllm_core/utils/analytics.py +++ b/openllm-core/src/openllm_core/utils/analytics.py @@ -6,7 +6,6 @@ from __future__ import annotations import contextlib, functools, logging, os, re, typing as t, importlib.metadata, attr, openllm_core from bentoml._internal.utils import analytics as _internal_analytics from openllm_core._typing_compat import ParamSpec - P = ParamSpec("P") T = t.TypeVar("T") logger = logging.getLogger(__name__) @@ -14,36 +13,36 
@@ logger = logging.getLogger(__name__) # This variable is a proxy that will control BENTOML_DO_NOT_TRACK OPENLLM_DO_NOT_TRACK = "OPENLLM_DO_NOT_TRACK" DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper() - @functools.lru_cache(maxsize=1) -def do_not_track() -> bool: return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES +def do_not_track() -> bool: + return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES @functools.lru_cache(maxsize=1) -def _usage_event_debugging() -> bool: return os.environ.get("__BENTOML_DEBUG_USAGE", str(False)).lower() == "true" - +def _usage_event_debugging() -> bool: + return os.environ.get("__BENTOML_DEBUG_USAGE", str(False)).lower() == "true" def silent(func: t.Callable[P, T]) -> t.Callable[P, T]: @functools.wraps(func) def wrapper(*args: P.args, **kwargs: P.kwargs) -> t.Any: - try: return func(*args, **kwargs) + try: + return func(*args, **kwargs) except Exception as err: if _usage_event_debugging(): if openllm_core.utils.get_debug_mode(): logger.error("Tracking Error: %s", err, stack_info=True, stacklevel=3) else: logger.info("Tracking Error: %s", err) else: logger.debug("Tracking Error: %s", err) - return wrapper + return wrapper @silent def track(event_properties: attr.AttrsInstance) -> None: if do_not_track(): return _internal_analytics.track(t.cast("_internal_analytics.schemas.EventMeta", event_properties)) - @contextlib.contextmanager def set_bentoml_tracking() -> t.Generator[None, None, None]: original_value = os.environ.pop(_internal_analytics.BENTOML_DO_NOT_TRACK, str(False)) try: os.environ[_internal_analytics.BENTOML_DO_NOT_TRACK] = str(do_not_track()) yield - finally: os.environ[_internal_analytics.BENTOML_DO_NOT_TRACK] = original_value - + finally: + os.environ[_internal_analytics.BENTOML_DO_NOT_TRACK] = original_value class EventMeta: @property def event_name(self) -> str: @@ -53,7 +52,6 @@ class EventMeta: suffix_to_remove = "_event" if event_name.endswith(suffix_to_remove): event_name = event_name[:-len(suffix_to_remove)] return event_name - @attr.define class ModelSaveEvent(EventMeta): module: str @@ -71,9 +69,10 @@ class OpenllmCliEvent(EventMeta): class StartInitEvent(EventMeta): model_name: str llm_config: t.Dict[str, t.Any] = attr.field(default=None) - @staticmethod - def handler(llm_config: openllm_core.LLMConfig) -> StartInitEvent: return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump()) + @staticmethod + def handler(llm_config: openllm_core.LLMConfig) -> StartInitEvent: + return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump()) def track_start_init(llm_config: openllm_core.LLMConfig) -> None: if do_not_track(): return track(StartInitEvent.handler(llm_config)) diff --git a/openllm-core/src/openllm_core/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py index 8cd3e630..ae31dd1f 100644 --- a/openllm-core/src/openllm_core/utils/codegen.py +++ b/openllm-core/src/openllm_core/utils/codegen.py @@ -23,21 +23,27 @@ def has_own_attribute(cls: type[t.Any], attrib_name: t.Any) -> bool: def get_annotations(cls: type[t.Any]) -> DictStrAny: if has_own_attribute(cls, "__annotations__"): return cls.__annotations__ return t.cast("DictStrAny", {}) - def is_class_var(annot: str | t.Any) -> bool: annot = str(annot) # Annotation can be quoted. 
if annot.startswith(("'", '"')) and annot.endswith(("'", '"')): annot = annot[1:-1] return annot.startswith(("typing.ClassVar", "t.ClassVar", "ClassVar", "typing_extensions.ClassVar",)) def add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str | None = None) -> _T: - try: method_or_cls.__module__ = cls.__module__ - except AttributeError: pass - try: method_or_cls.__qualname__ = f"{cls.__qualname__}.{method_or_cls.__name__}" - except AttributeError: pass - try: method_or_cls.__doc__ = _overwrite_doc or "Generated by ``openllm.LLMConfig`` for class " f"{cls.__qualname__}." - except AttributeError: pass + try: + method_or_cls.__module__ = cls.__module__ + except AttributeError: + pass + try: + method_or_cls.__qualname__ = f"{cls.__qualname__}.{method_or_cls.__name__}" + except AttributeError: + pass + try: + method_or_cls.__doc__ = _overwrite_doc or "Generated by ``openllm.LLMConfig`` for class " f"{cls.__qualname__}." + except AttributeError: + pass return method_or_cls -def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = "") -> None: eval(compile(script, filename, "exec"), globs, locs) +def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = "") -> None: + eval(compile(script, filename, "exec"), globs, locs) def _make_method(name: str, script: str, filename: str, globs: DictStrAny) -> AnyCallable: locs: DictStrAny = {} # In order of debuggers like PDB being able to step through the code, we add a fake linecache entry. @@ -52,7 +58,6 @@ def _make_method(name: str, script: str, filename: str, globs: DictStrAny) -> An count += 1 _compile_and_eval(script, globs, locs, filename) return locs[name] - def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t.Any]: """Create a tuple subclass to hold class attributes. @@ -67,14 +72,16 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t. attr_class_name = f"{cls_name}Attributes" attr_class_template = [f"class {attr_class_name}(tuple):", " __slots__ = ()",] if attr_names: - for i, attr_name in enumerate(attr_names): attr_class_template.append(f" {attr_name} = _attrs_property(_attrs_itemgetter({i}))") - else: attr_class_template.append(" pass") + for i, attr_name in enumerate(attr_names): + attr_class_template.append(f" {attr_name} = _attrs_property(_attrs_itemgetter({i}))") + else: + attr_class_template.append(" pass") globs: DictStrAny = {"_attrs_itemgetter": itemgetter, "_attrs_property": property} if SHOW_CODEGEN: logger.info("Generated class for %s:\n\n%s", attr_class_name, "\n".join(attr_class_template)) _compile_and_eval("\n".join(attr_class_template), globs) return globs[attr_class_name] - -def generate_unique_filename(cls: type[t.Any], func_name: str) -> str: return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>" +def generate_unique_filename(cls: type[t.Any], func_name: str) -> str: + return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>" def generate_function(typ: type[t.Any], func_name: str, lines: list[str] | None, args: tuple[str, ...] 
| None, globs: dict[str, t.Any], annotations: dict[str, t.Any] | None = None) -> AnyCallable: from openllm_core.utils import SHOW_CODEGEN script = "def %s(%s):\n %s\n" % (func_name, ", ".join(args) if args is not None else "", "\n ".join(lines) if lines else "pass") @@ -82,10 +89,12 @@ def generate_function(typ: type[t.Any], func_name: str, lines: list[str] | None, if annotations: meth.__annotations__ = annotations if SHOW_CODEGEN: logger.info("Generated script for %s:\n\n%s", typ, script) return meth - def make_env_transformer(cls: type[openllm_core.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None,) -> AnyCallable: from openllm_core.utils import dantic, field_env_key - def identity(_: str, x_value: t.Any) -> t.Any: return x_value + + def identity(_: str, x_value: t.Any) -> t.Any: + return x_value + default_callback = identity if default_callback is None else default_callback globs = {} if globs is None else globs globs.update({"__populate_env": dantic.env_converter, "__default_callback": default_callback, "__field_env": field_env_key, "__suffix": suffix or "", "__model_name": model_name,}) @@ -97,10 +106,14 @@ def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: from openllm_core.utils import ReprMixin if name is None: name = func.__name__.strip("_") _signatures = inspect.signature(func).parameters - def _repr(self: ReprMixin) -> str: return f"" - def _repr_args(self: ReprMixin) -> t.Iterator[t.Tuple[str, t.Any]]: return ((k, _signatures[k].annotation) for k in self.__repr_keys__) + + def _repr(self: ReprMixin) -> str: + return f"" + + def _repr_args(self: ReprMixin) -> t.Iterator[t.Tuple[str, t.Any]]: + return ((k, _signatures[k].annotation) for k in self.__repr_keys__) + if func.__doc__ is None: doc = f"Generated SDK for {func.__name__}" else: doc = func.__doc__ return t.cast(_T, functools.update_wrapper(types.new_class(name, (t.cast("PartialAny", functools.partial), ReprMixin), exec_body=lambda ns: ns.update({"__repr_keys__": property(lambda _: [i for i in _signatures.keys() if not i.startswith("_")]), "__repr_args__": _repr_args, "__repr__": _repr, "__doc__": inspect.cleandoc(doc), "__module__": "openllm"}),)(func, **attrs), func,)) - __all__ = ["gen_sdk", "make_attr_tuple_class", "make_env_transformer", "generate_unique_filename", "generate_function"] diff --git a/openllm-core/src/openllm_core/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py index 97d320b5..87e1a3d4 100644 --- a/openllm-core/src/openllm_core/utils/dantic.py +++ b/openllm-core/src/openllm_core/utils/dantic.py @@ -3,11 +3,7 @@ from __future__ import annotations import functools, importlib, os, sys, typing as t from enum import Enum import attr, click, click_option_group as cog, inflection, orjson -from click import ( - ParamType, - shell_completion as sc, - types as click_types, -) +from click import (ParamType, shell_completion as sc, types as click_types,) if t.TYPE_CHECKING: from attr import _ValidatorType @@ -15,8 +11,8 @@ AnyCallable = t.Callable[..., t.Any] FC = t.TypeVar("FC", bound=t.Union[AnyCallable, click.Command]) __all__ = ["FC", "attrs_to_options", "Field", "parse_type", "is_typing", "is_literal", "ModuleType", "EnumChoice", "LiteralChoice", "allows_multiple", "is_mapping", "is_container", "parse_container_args", "parse_single_arg", "CUDA", "JsonType", "BytesType"] -def __dir__() -> list[str]: return sorted(__all__) - +def __dir__() -> list[str]: + return 
sorted(__all__) def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, typ: t.Any | None = None, suffix_generation: bool = False, suffix_sampling: bool = False,) -> t.Callable[[FC], FC]: # TODO: support parsing nested attrs class and Union envvar = field.metadata["env"] @@ -34,15 +30,15 @@ def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, t else: identifier = f"{model_name}_{underscored}" return cog.optgroup.option(identifier, full_option_name, type=parse_type(typ), required=field.default is attr.NOTHING, default=field.default if field.default not in (attr.NOTHING, None) else None, show_default=True, multiple=allows_multiple(typ) if typ else False, help=field.metadata.get("description", "(No description provided)"), show_envvar=True, envvar=envvar,) - def env_converter(value: t.Any, env: str | None = None) -> t.Any: if env is not None: value = os.environ.get(env, value) if value is not None and isinstance(value, str): - try: return orjson.loads(value.lower()) - except orjson.JSONDecodeError as err: raise RuntimeError(f"Failed to parse ({value!r}) from '{env}': {err}") from None + try: + return orjson.loads(value.lower()) + except orjson.JSONDecodeError as err: + raise RuntimeError(f"Failed to parse ({value!r}) from '{env}': {err}") from None return value - def Field(default: t.Any = None, *, ge: int | float | None = None, le: int | float | None = None, validator: _ValidatorType[t.Any] | None = None, description: str | None = None, env: str | None = None, auto_default: bool = False, use_default_converter: bool = True, **attrs: t.Any) -> t.Any: """A decorator that extends attr.field with additional arguments, which provides the same interface as pydantic's Field. @@ -94,7 +90,6 @@ def Field(default: t.Any = None, *, ge: int | float | None = None, le: int | flo attrs.pop("default") return attr.field(metadata=metadata, validator=_validator, converter=converter, **attrs) - def parse_type(field_type: t.Any) -> ParamType | tuple[ParamType, ...]: """Transforms the pydantic field's type into a click-compatible type. @@ -125,7 +120,6 @@ def parse_type(field_type: t.Any) -> ParamType | tuple[ParamType, ...]: if lenient_issubclass(field_type, bytes): return BytesType() # return the current type: it should be a primitive return field_type - def is_typing(field_type: type) -> bool: """Checks whether the current type is a module-like type. @@ -139,7 +133,6 @@ def is_typing(field_type: type) -> bool: if raw is None: return False if raw is type or raw is t.Type: return True return False - def is_literal(field_type: type) -> bool: """Checks whether the given field type is a Literal type or not. 
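A short sketch of how `env_converter` above resolves values; the environment variable names are hypothetical, and the second one is assumed to be unset:

import os
from openllm_core.utils.dantic import env_converter

os.environ["OPENLLM_EXAMPLE_FLAG"] = "True"
env_converter(None, env="OPENLLM_EXAMPLE_FLAG")  # -> True: the string is lowered and parsed with orjson
env_converter(0.75, env="OPENLLM_UNSET_FLAG")    # -> 0.75: falls back to the provided value when the variable is unset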
@@ -154,7 +147,6 @@ def is_literal(field_type: type) -> bool: """ origin = t.get_origin(field_type) return origin is not None and origin is t.Literal - class ModuleType(ParamType): name = "module" @@ -165,15 +157,17 @@ class ModuleType(ParamType): module = importlib.import_module(module_name) if class_name: - try: return getattr(module, class_name) - except AttributeError: raise ImportError(f"Module '{module_name}' does not define a '{class_name}' variable.") from None + try: + return getattr(module, class_name) + except AttributeError: + raise ImportError(f"Module '{module_name}' does not define a '{class_name}' variable.") from None def convert(self, value: str | t.Any, param: click.Parameter | None, ctx: click.Context | None) -> t.Any: try: if isinstance(value, str): return self._import_object(value) return value - except Exception as exc: self.fail(f"'{value}' is not a valid object ({type(exc)}: {exc!s})", param, ctx) - + except Exception as exc: + self.fail(f"'{value}' is not a valid object ({type(exc)}: {exc!s})", param, ctx) class EnumChoice(click.Choice): name = "enum" @@ -196,7 +190,6 @@ class EnumChoice(click.Choice): if isinstance(result, str): result = self.internal_type[result] return result - class LiteralChoice(EnumChoice): name = "literal" @@ -209,7 +202,6 @@ class LiteralChoice(EnumChoice): _mapping = {str(v): v for v in values} super(EnumChoice, self).__init__(list(_mapping), case_sensitive) self.internal_type = item_type - def allows_multiple(field_type: type[t.Any]) -> bool: """Checks whether the current type allows for multiple arguments to be provided as input or not. @@ -234,7 +226,6 @@ def allows_multiple(field_type: type[t.Any]) -> bool: # For the moment, only non-composite types are allowed. return not isinstance(args, tuple) return False - def is_mapping(field_type: type) -> bool: """Checks whether this field represents a dictionary or JSON object. @@ -251,7 +242,6 @@ def is_mapping(field_type: type) -> bool: origin = t.get_origin(field_type) if origin is None: return False return lenient_issubclass(origin, t.Mapping) - def is_container(field_type: type) -> bool: """Checks whether the current type is a container type ('contains' other types), like lists and tuples. @@ -270,7 +260,6 @@ def is_container(field_type: type) -> bool: # Early out for non-typing objects if origin is None: return False return lenient_issubclass(origin, t.Container) - def parse_container_args(field_type: type[t.Any]) -> ParamType | tuple[ParamType, ...]: """Parses the arguments inside a container type (lists, tuples and so on). @@ -293,7 +282,6 @@ def parse_container_args(field_type: type[t.Any]) -> ParamType | tuple[ParamType return parse_single_arg(args[0]) # Then deal with fixed-length containers: Tuple[str, int, int] return tuple(parse_single_arg(arg) for arg in args) - def parse_single_arg(arg: type) -> ParamType: """Returns the click-compatible type for container origin types. 
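A minimal sketch of the classification helpers above; the return values are inferred from the bodies shown in this hunk, assuming the elided lines compute `origin` via `t.get_origin`:

import typing as t
from openllm_core.utils.dantic import is_container, is_mapping

is_mapping(t.Dict[str, int])  # -> True: origin is dict, which passes lenient_issubclass(..., t.Mapping)
is_mapping(dict)              # -> False: a bare class has no typing origin
is_container(t.List[int])     # -> True: origin is list, a t.Container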
@@ -314,23 +302,26 @@ def parse_single_arg(arg: type) -> ParamType: if is_container(arg): return JsonType() if lenient_issubclass(arg, bytes): return BytesType() return click_types.convert_type(arg) - class BytesType(ParamType): name = "bytes" + def convert(self, value: t.Any, param: click.Parameter | None, ctx: click.Context | None) -> t.Any: if isinstance(value, bytes): return value - try: return str.encode(value) - except Exception as exc: self.fail(f"'{value}' is not a valid string ({exc!s})", param, ctx) - + try: + return str.encode(value) + except Exception as exc: + self.fail(f"'{value}' is not a valid string ({exc!s})", param, ctx) CYGWIN = sys.platform.startswith("cygwin") WIN = sys.platform.startswith("win") if sys.platform.startswith("win") and WIN: + def _get_argv_encoding() -> str: import locale return locale.getpreferredencoding() else: - def _get_argv_encoding() -> str: return getattr(sys.stdin, "encoding", None) or sys.getfilesystemencoding() + def _get_argv_encoding() -> str: + return getattr(sys.stdin, "encoding", None) or sys.getfilesystemencoding() class CudaValueType(ParamType): name = "cuda" envvar_list_splitter = "," @@ -341,6 +332,7 @@ class CudaValueType(ParamType): if "-1" in var: return var[:var.index("-1")] return var + def shell_complete(self, ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: """Return a list of :class:`~click.shell_completion.CompletionItem` objects for the incomplete value. @@ -354,25 +346,30 @@ class CudaValueType(ParamType): from openllm_core.utils import available_devices mapping = incomplete.split(self.envvar_list_splitter) if incomplete else available_devices() return [sc.CompletionItem(str(i), help=f"CUDA device index {i}") for i in mapping] + def convert(self, value: t.Any, param: click.Parameter | None, ctx: click.Context | None) -> t.Any: typ = click_types.convert_type(str) if isinstance(value, bytes): enc = _get_argv_encoding() - try: value = value.decode(enc) + try: + value = value.decode(enc) except UnicodeError: fs_enc = sys.getfilesystemencoding() if fs_enc != enc: - try: value = value.decode(fs_enc) - except UnicodeError: value = value.decode("utf-8", "replace") - else: value = value.decode("utf-8", "replace") + try: + value = value.decode(fs_enc) + except UnicodeError: + value = value.decode("utf-8", "replace") + else: + value = value.decode("utf-8", "replace") return tuple(typ(x, param, ctx) for x in value.split(",")) - def __repr__(self) -> str: return "STRING" - + def __repr__(self) -> str: + return "STRING" CUDA = CudaValueType() - class JsonType(ParamType): name = "json" + def __init__(self, should_load: bool = True) -> None: """Support JSON type for click.ParamType. 
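For reference, a brief sketch of the `CUDA` param type instantiated above; `param` and `ctx` are passed as None since no click invocation is involved:

from openllm_core.utils.dantic import CUDA

CUDA.convert("0,1,2", None, None)  # -> ("0", "1", "2"): the value is split on "," and each device index is kept as a string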
@@ -381,7 +378,10 @@ class JsonType(ParamType): """ super().__init__() self.should_load = should_load + def convert(self, value: t.Any, param: click.Parameter | None, ctx: click.Context | None) -> t.Any: if isinstance(value, dict) or not self.should_load: return value - try: return orjson.loads(value) - except orjson.JSONDecodeError as exc: self.fail(f"'{value}' is not a valid JSON string ({exc!s})", param, ctx) + try: + return orjson.loads(value) + except orjson.JSONDecodeError as exc: + self.fail(f"'{value}' is not a valid JSON string ({exc!s})", param, ctx) diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py index 6179dbd5..35aa07ee 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -11,7 +11,6 @@ from .representation import ReprMixin if t.TYPE_CHECKING: BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]] from openllm_core._typing_compat import LiteralRuntime - logger = logging.getLogger(__name__) OPTIONAL_DEPENDENCIES = {"opt", "flan-t5", "vllm", "fine-tune", "ggml", "agents", "openai", "playground", "gptq"} ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} @@ -20,14 +19,14 @@ USE_TF = os.environ.get("USE_TF", "AUTO").upper() USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper() FORCE_TF_AVAILABLE = os.environ.get("FORCE_TF_AVAILABLE", "AUTO").upper() - def _is_package_available(package: str) -> bool: _package_available = importlib.util.find_spec(package) is not None if _package_available: - try: importlib.metadata.version(package) - except importlib.metadata.PackageNotFoundError: _package_available = False + try: + importlib.metadata.version(package) + except importlib.metadata.PackageNotFoundError: + _package_available = False return _package_available - _torch_available = importlib.util.find_spec("torch") is not None _tf_available = importlib.util.find_spec("tensorflow") is not None _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None @@ -48,32 +47,52 @@ _autogptq_available = _is_package_available("auto_gptq") _sentencepiece_available = _is_package_available("sentencepiece") _xformers_available = _is_package_available("xformers") _fairscale_available = _is_package_available("fairscale") - -def is_transformers_available() -> bool: return _transformers_available -def is_grpc_available() -> bool: return _grpc_available -def is_grpc_health_available() -> bool: return _grpc_health_available -def is_transformers_supports_kbit() -> bool: return pkg.pkg_version_info("transformers")[:2] >= (4, 30) -def is_transformers_supports_agent() -> bool: return pkg.pkg_version_info("transformers")[:2] >= (4, 29) -def is_jupyter_available() -> bool: return _jupyter_available -def is_jupytext_available() -> bool: return _jupytext_available -def is_notebook_available() -> bool: return _notebook_available -def is_triton_available() -> bool: return _triton_available -def is_datasets_available() -> bool: return _datasets_available -def is_peft_available() -> bool: return _peft_available -def is_einops_available() -> bool: return _einops_available -def is_cpm_kernels_available() -> bool: return _cpm_kernel_available -def is_bitsandbytes_available() -> bool: return _bitsandbytes_available -def is_autogptq_available() -> bool: return _autogptq_available -def is_vllm_available() -> bool: return _vllm_available -def 
is_sentencepiece_available() -> bool: return _sentencepiece_available -def is_xformers_available() -> bool: return _xformers_available -def is_fairscale_available() -> bool: return _fairscale_available +def is_transformers_available() -> bool: + return _transformers_available +def is_grpc_available() -> bool: + return _grpc_available +def is_grpc_health_available() -> bool: + return _grpc_health_available +def is_transformers_supports_kbit() -> bool: + return pkg.pkg_version_info("transformers")[:2] >= (4, 30) +def is_transformers_supports_agent() -> bool: + return pkg.pkg_version_info("transformers")[:2] >= (4, 29) +def is_jupyter_available() -> bool: + return _jupyter_available +def is_jupytext_available() -> bool: + return _jupytext_available +def is_notebook_available() -> bool: + return _notebook_available +def is_triton_available() -> bool: + return _triton_available +def is_datasets_available() -> bool: + return _datasets_available +def is_peft_available() -> bool: + return _peft_available +def is_einops_available() -> bool: + return _einops_available +def is_cpm_kernels_available() -> bool: + return _cpm_kernel_available +def is_bitsandbytes_available() -> bool: + return _bitsandbytes_available +def is_autogptq_available() -> bool: + return _autogptq_available +def is_vllm_available() -> bool: + return _vllm_available +def is_sentencepiece_available() -> bool: + return _sentencepiece_available +def is_xformers_available() -> bool: + return _xformers_available +def is_fairscale_available() -> bool: + return _fairscale_available def is_torch_available() -> bool: global _torch_available if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES: if _torch_available: - try: importlib.metadata.version("torch") - except importlib.metadata.PackageNotFoundError: _torch_available = False + try: + importlib.metadata.version("torch") + except importlib.metadata.PackageNotFoundError: + _torch_available = False else: logger.info("Disabling PyTorch because USE_TF is set") _torch_available = False @@ -92,7 +111,8 @@ def is_tf_available() -> bool: try: _tf_version = importlib.metadata.version(_pkg) break - except importlib.metadata.PackageNotFoundError: pass # Ok to ignore here since we actually need to check for all possible tensorflow distribution. + except importlib.metadata.PackageNotFoundError: + pass # Ok to ignore here since we actually need to check for all possible tensorflow distribution. _tf_available = _tf_version is not None if _tf_available: if _tf_version and packaging.version.parse(_tf_version) < packaging.version.parse("2"): @@ -109,11 +129,11 @@ def is_flax_available() -> bool: try: importlib.metadata.version("jax") importlib.metadata.version("flax") - except importlib.metadata.PackageNotFoundError: _flax_available = False + except importlib.metadata.PackageNotFoundError: + _flax_available = False else: _flax_available = False return _flax_available - VLLM_IMPORT_ERROR_WITH_PYTORCH = """\ {0} requires the vLLM library but it was not found in your environment. However, we were able to find a PyTorch installation. PyTorch classes do not begin @@ -220,22 +240,19 @@ You can install it with pip: `pip install fairscale`. Please note that you may n your runtime after installation. 
""" -BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([("flax", (is_flax_available, FLAX_IMPORT_ERROR)), ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), - ("vllm", (is_vllm_available, VLLM_IMPORT_ERROR)), ("cpm_kernels", (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)), ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)), - ("triton", (is_triton_available, TRITON_IMPORT_ERROR)), ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ("peft", (is_peft_available, PEFT_IMPORT_ERROR)), - ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), ("auto-gptq", (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)), ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), - ("xformers", (is_xformers_available, XFORMERS_IMPORT_ERROR)), ("fairscale", (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))]) - +BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([("flax", (is_flax_available, FLAX_IMPORT_ERROR)), ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), ("vllm", (is_vllm_available, VLLM_IMPORT_ERROR)), ("cpm_kernels", (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)), ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)), ("triton", (is_triton_available, TRITON_IMPORT_ERROR)), ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ( + "peft", (is_peft_available, PEFT_IMPORT_ERROR) +), ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), ("auto-gptq", (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)), ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), ("xformers", (is_xformers_available, XFORMERS_IMPORT_ERROR)), ("fairscale", (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))]) class DummyMetaclass(abc.ABCMeta): """Metaclass for dummy object. It will raises ImportError generated by ``require_backends`` if users try to access attributes from given class. """ _backends: t.List[str] + def __getattribute__(cls, key: str) -> t.Any: if key.startswith("_"): return super().__getattribute__(key) require_backends(cls, cls._backends) - def require_backends(o: t.Any, backends: t.MutableSequence[str]) -> None: if not isinstance(backends, (list, tuple)): backends = list(backends) name = o.__name__ if hasattr(o, "__name__") else o.__class__.__name__ @@ -250,7 +267,6 @@ def require_backends(o: t.Any, backends: t.MutableSequence[str]) -> None: if "flax" not in backends and is_flax_available() and not is_vllm_available(): raise ImportError(VLLM_IMPORT_ERROR_WITH_FLAX.format(name)) failed = [msg.format(name) for available, msg in (BACKENDS_MAPPING[backend] for backend in backends) if not available()] if failed: raise ImportError("".join(failed)) - class EnvVarMixin(ReprMixin): model_name: str config: str @@ -259,32 +275,56 @@ class EnvVarMixin(ReprMixin): framework: str bettertransformer: str runtime: str + @overload - def __getitem__(self, item: t.Literal["config"]) -> str: ... + def __getitem__(self, item: t.Literal["config"]) -> str: + ... + @overload - def __getitem__(self, item: t.Literal["model_id"]) -> str: ... + def __getitem__(self, item: t.Literal["model_id"]) -> str: + ... + @overload - def __getitem__(self, item: t.Literal["quantize"]) -> str: ... + def __getitem__(self, item: t.Literal["quantize"]) -> str: + ... + @overload - def __getitem__(self, item: t.Literal["framework"]) -> str: ... + def __getitem__(self, item: t.Literal["framework"]) -> str: + ... 
+ @overload - def __getitem__(self, item: t.Literal["bettertransformer"]) -> str: ... + def __getitem__(self, item: t.Literal["bettertransformer"]) -> str: + ... + @overload - def __getitem__(self, item: t.Literal["runtime"]) -> str: ... + def __getitem__(self, item: t.Literal["runtime"]) -> str: + ... + @overload - def __getitem__(self, item: t.Literal["framework_value"]) -> LiteralRuntime: ... + def __getitem__(self, item: t.Literal["framework_value"]) -> LiteralRuntime: + ... + @overload - def __getitem__(self, item: t.Literal["quantize_value"]) -> t.Literal["int8", "int4", "gptq"] | None: ... + def __getitem__(self, item: t.Literal["quantize_value"]) -> t.Literal["int8", "int4", "gptq"] | None: + ... + @overload - def __getitem__(self, item: t.Literal["model_id_value"]) -> str | None: ... + def __getitem__(self, item: t.Literal["model_id_value"]) -> str | None: + ... + @overload - def __getitem__(self, item: t.Literal["bettertransformer_value"]) -> bool: ... + def __getitem__(self, item: t.Literal["bettertransformer_value"]) -> bool: + ... + @overload - def __getitem__(self, item: t.Literal["runtime_value"]) -> t.Literal["ggml", "transformers"]: ... + def __getitem__(self, item: t.Literal["runtime_value"]) -> t.Literal["ggml", "transformers"]: + ... + def __getitem__(self, item: str | t.Any) -> t.Any: if item.endswith("_value") and hasattr(self, f"_{item}"): return object.__getattribute__(self, f"_{item}")() elif hasattr(self, item): return getattr(self, item) raise KeyError(f"Key {item} not found in {self}") + def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: LiteralString | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers") -> None: """EnvVarMixin is a mixin class that returns the value extracted from environment variables.""" from openllm_core.utils import field_env_key @@ -294,25 +334,37 @@ class EnvVarMixin(ReprMixin): self._bettertransformer = bettertransformer self._quantize = quantize self._runtime = runtime - for att in {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}: setattr(self, att, field_env_key(self.model_name, att.upper())) + for att in {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}: + setattr(self, att, field_env_key(self.model_name, att.upper())) + def _quantize_value(self) -> t.Literal["int8", "int4", "gptq"] | None: from . import first_not_none return t.cast(t.Optional[t.Literal["int8", "int4", "gptq"]], first_not_none(os.environ.get(self["quantize"]), default=self._quantize)) + def _framework_value(self) -> LiteralRuntime: from . import first_not_none return t.cast(t.Literal["pt", "tf", "flax", "vllm"], first_not_none(os.environ.get(self["framework"]), default=self._implementation)) + def _bettertransformer_value(self) -> bool: from . import first_not_none return t.cast(bool, first_not_none(os.environ.get(self["bettertransformer"], str(False)).upper() in ENV_VARS_TRUE_VALUES, default=self._bettertransformer)) + def _model_id_value(self) -> str | None: from . import first_not_none return first_not_none(os.environ.get(self["model_id"]), default=self._model_id) + def _runtime_value(self) -> t.Literal["ggml", "transformers"]: from . 
import first_not_none return t.cast(t.Literal["ggml", "transformers"], first_not_none(os.environ.get(self["runtime"]), default=self._runtime)) + @property - def __repr_keys__(self) -> set[str]: return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"} + def __repr_keys__(self) -> set[str]: + return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"} + @property - def start_docstring(self) -> str: return getattr(openllm_core.config, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING") + def start_docstring(self) -> str: + return getattr(openllm_core.config, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING") + @property - def module(self) -> LazyLoader: return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}") + def module(self) -> LazyLoader: + return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}") diff --git a/openllm-core/src/openllm_core/utils/lazy.py b/openllm-core/src/openllm_core/utils/lazy.py index ab1dcc41..daf4e560 100644 --- a/openllm-core/src/openllm_core/utils/lazy.py +++ b/openllm-core/src/openllm_core/utils/lazy.py @@ -1,6 +1,5 @@ from __future__ import annotations import functools, importlib, importlib.machinery, importlib.metadata, importlib.util, itertools, os, time, types, warnings, typing as t, attr, openllm_core - __all__ = ["VersionInfo", "LazyModule"] # vendorred from attrs @functools.total_ordering @@ -10,27 +9,36 @@ class VersionInfo: minor: int = attr.field() micro: int = attr.field() releaselevel: str = attr.field() + @classmethod def from_version_string(cls, s: str) -> VersionInfo: v = s.split(".") if len(v) == 3: v.append("final") return cls(major=int(v[0]), minor=int(v[1]), micro=int(v[2]), releaselevel=v[3]) + def _ensure_tuple(self, other: VersionInfo) -> tuple[tuple[int, int, int, str], tuple[int, int, int, str]]: cmp = attr.astuple(other) if self.__class__ is other.__class__ else other if not isinstance(cmp, tuple): raise NotImplementedError if not (1 <= len(cmp) <= 4): raise NotImplementedError return t.cast(t.Tuple[int, int, int, str], attr.astuple(self)[:len(cmp)]), t.cast(t.Tuple[int, int, int, str], cmp) + def __eq__(self, other: t.Any) -> bool: - try: us, them = self._ensure_tuple(other) - except NotImplementedError: return NotImplemented + try: + us, them = self._ensure_tuple(other) + except NotImplementedError: + return NotImplemented return us == them + def __lt__(self, other: t.Any) -> bool: - try: us, them = self._ensure_tuple(other) - except NotImplementedError: return NotImplemented + try: + us, them = self._ensure_tuple(other) + except NotImplementedError: + return NotImplemented # Since alphabetically "dev0" < "final" < "post1" < "post2", we don't have to do anything special with releaselevel for now. 
return us < them - def __repr__(self) -> str: return "{0}.{1}.{2}".format(*attr.astuple(self)[:3]) + def __repr__(self) -> str: + return "{0}.{1}.{2}".format(*attr.astuple(self)[:3]) _sentinel, _reserved_namespace = object(), {"__openllm_migration__"} class LazyModule(types.ModuleType): # Very heavily inspired by optuna.integration._IntegrationModule: https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py @@ -52,7 +60,8 @@ class LazyModule(types.ModuleType): self._class_to_module: dict[str, str] = {} _extra_objects = {} if extra_objects is None else extra_objects for key, values in import_structure.items(): - for value in values: self._class_to_module[value] = key + for value in values: + self._class_to_module[value] = key # Needed for autocompletion in an IDE self.__all__: list[str] = list(import_structure.keys()) + list(itertools.chain(*import_structure.values())) self.__file__ = module_file @@ -62,11 +71,13 @@ class LazyModule(types.ModuleType): self._name = name self._objects = _extra_objects self._import_structure = import_structure + def __dir__(self) -> list[str]: result = t.cast("list[str]", super().__dir__()) # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir. return result + [i for i in self.__all__ if i not in result] + def __getattr__(self, name: str) -> t.Any: """Equivocal __getattr__ implementation. @@ -99,8 +110,13 @@ class LazyModule(types.ModuleType): else: raise AttributeError(f"module {self.__name__} has no attribute {name}") setattr(self, name, value) return value + def _get_module(self, module_name: str) -> types.ModuleType: - try: return importlib.import_module("." + module_name, self.__name__) - except Exception as e: raise RuntimeError(f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}") from e + try: + return importlib.import_module("." 
+ module_name, self.__name__) + except Exception as e: + raise RuntimeError(f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}") from e + # make sure this module is picklable - def __reduce__(self) -> tuple[type[LazyModule], tuple[str, str | None, dict[str, list[str]]]]: return (self.__class__, (self._name, self.__file__, self._import_structure)) + def __reduce__(self) -> tuple[type[LazyModule], tuple[str, str | None, dict[str, list[str]]]]: + return (self.__class__, (self._name, self.__file__, self._import_structure)) diff --git a/openllm-core/src/openllm_core/utils/representation.py b/openllm-core/src/openllm_core/utils/representation.py index b7f33eb1..4433510b 100644 --- a/openllm-core/src/openllm_core/utils/representation.py +++ b/openllm-core/src/openllm_core/utils/representation.py @@ -8,23 +8,40 @@ ReprArgs: TypeAlias = t.Generator[t.Tuple[t.Optional[str], t.Any], None, None] class ReprMixin: @property @abstractmethod - def __repr_keys__(self) -> set[str]: raise NotImplementedError + def __repr_keys__(self) -> set[str]: + raise NotImplementedError + """This can be overriden by base class using this mixin.""" - def __repr__(self) -> str: return f"{self.__class__.__name__} {orjson.dumps({k: utils.bentoml_cattr.unstructure(v) if attr.has(v) else v for k, v in self.__repr_args__()}, option=orjson.OPT_INDENT_2).decode()}" + + def __repr__(self) -> str: + return f"{self.__class__.__name__} {orjson.dumps({k: utils.bentoml_cattr.unstructure(v) if attr.has(v) else v for k, v in self.__repr_args__()}, option=orjson.OPT_INDENT_2).decode()}" + """The `__repr__` for any subclass of Mixin. It will print nicely the class name with each of the fields under '__repr_keys__' as kv JSON dict. """ - def __str__(self) -> str: return self.__repr_str__(" ") + + def __str__(self) -> str: + return self.__repr_str__(" ") + """The string representation of the given Mixin subclass. It will contains all of the attributes from __repr_keys__ """ - def __repr_name__(self) -> str: return self.__class__.__name__ + + def __repr_name__(self) -> str: + return self.__class__.__name__ + """Name of the instance's class, used in __repr__.""" - def __repr_str__(self, join_str: str) -> str: return join_str.join(repr(v) if a is None else f"{a}={v!r}" for a, v in self.__repr_args__()) + + def __repr_str__(self, join_str: str) -> str: + return join_str.join(repr(v) if a is None else f"{a}={v!r}" for a, v in self.__repr_args__()) + """To be used with __str__.""" - def __repr_args__(self) -> ReprArgs: return ((k, getattr(self, k)) for k in self.__repr_keys__) + + def __repr_args__(self) -> ReprArgs: + return ((k, getattr(self, k)) for k in self.__repr_keys__) + """This can also be overriden by base class using this mixin. By default it does a getattr of the current object from __repr_keys__. diff --git a/openllm-python/pyoxidizer.bzl b/openllm-python/pyoxidizer.bzl index 01ac73d1..a0180c17 100644 --- a/openllm-python/pyoxidizer.bzl +++ b/openllm-python/pyoxidizer.bzl @@ -1,106 +1,58 @@ -# Copyright 2023 BentoML Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Entrypoint for using pyoxidizer to package openllm into standalone binary distribution.""" - VERSION = VARS["version"] APP_NAME = "openllm" DISPLAY_NAME = "OpenLLM" AUTHOR = "BentoML" - def make_msi(target_triple): - if target_triple == "x86_64-pc-windows-msvc": - arch = "x64" - elif target_triple == "i686-pc-windows-msvc": - arch = "x86" - else: - arch = "unknown" + if target_triple == "x86_64-pc-windows-msvc": + arch = "x64" + elif target_triple == "i686-pc-windows-msvc": + arch = "x86" + else: + arch = "unknown" - # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_wix_msi_builder.html - msi = WiXMSIBuilder( - id_prefix = APP_NAME, - product_name = DISPLAY_NAME, - product_version = VERSION, - product_manufacturer = AUTHOR, - arch = arch, - ) - msi.msi_filename = DISPLAY_NAME + "-" + VERSION + "-" + arch + ".msi" - msi.help_url = "https://github.com/bentoml/OpenLLM/" - msi.license_path = CWD + "/LICENSE.md" + # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_wix_msi_builder.html + msi = WiXMSIBuilder(id_prefix=APP_NAME, product_name=DISPLAY_NAME, product_version=VERSION, product_manufacturer=AUTHOR, arch=arch,) + msi.msi_filename = DISPLAY_NAME + "-" + VERSION + "-" + arch + ".msi" + msi.help_url = "https://github.com/bentoml/OpenLLM/" + msi.license_path = CWD + "/LICENSE.md" - # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_file_manifest.html - m = FileManifest() + # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_file_manifest.html + m = FileManifest() - exe_prefix = "targets/" + target_triple + "/" - m.add_path( - path = exe_prefix + APP_NAME + ".exe", - strip_prefix = exe_prefix, - ) + exe_prefix = "targets/" + target_triple + "/" + m.add_path(path=exe_prefix + APP_NAME + ".exe", strip_prefix=exe_prefix,) - msi.add_program_files_manifest(m) - - return msi + msi.add_program_files_manifest(m) + return msi def make_exe_installer(): - # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_wix_bundle_builder.html - bundle = WiXBundleBuilder( - id_prefix = APP_NAME, - name = DISPLAY_NAME, - version = VERSION, - manufacturer = AUTHOR, - ) + # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_wix_bundle_builder.html + bundle = WiXBundleBuilder(id_prefix=APP_NAME, name=DISPLAY_NAME, version=VERSION, manufacturer=AUTHOR,) - bundle.add_vc_redistributable("x64") - bundle.add_vc_redistributable("x86") + bundle.add_vc_redistributable("x64") + bundle.add_vc_redistributable("x86") - bundle.add_wix_msi_builder( - builder = make_msi("x86_64-pc-windows-msvc"), - display_internal_ui = True, - install_condition = "VersionNT64", - ) - bundle.add_wix_msi_builder( - builder = make_msi("i686-pc-windows-msvc"), - display_internal_ui = True, - install_condition = "Not VersionNT64", - ) - - return bundle + bundle.add_wix_msi_builder(builder=make_msi("x86_64-pc-windows-msvc"), display_internal_ui=True, install_condition="VersionNT64",) + bundle.add_wix_msi_builder(builder=make_msi("i686-pc-windows-msvc"), display_internal_ui=True, install_condition="Not VersionNT64",) + return bundle def 
make_macos_app_bundle(): - # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_macos_application_bundle_builder.html - bundle = MacOsApplicationBundleBuilder(DISPLAY_NAME) - bundle.set_info_plist_required_keys( - display_name = DISPLAY_NAME, - identifier = "com.github.bentoml." + APP_NAME, - version = VERSION, - signature = "oplm", - executable = APP_NAME, - ) + # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_macos_application_bundle_builder.html + bundle = MacOsApplicationBundleBuilder(DISPLAY_NAME) + bundle.set_info_plist_required_keys(display_name=DISPLAY_NAME, identifier="com.github.bentoml." + APP_NAME, version=VERSION, signature="oplm", executable=APP_NAME,) - # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_apple_universal_binary.html - universal = AppleUniversalBinary(APP_NAME) + # https://gregoryszorc.com/docs/pyoxidizer/main/tugger_starlark_type_apple_universal_binary.html + universal = AppleUniversalBinary(APP_NAME) - for target in ["aarch64-apple-darwin", "x86_64-apple-darwin"]: - universal.add_path("targets/" + target + "/" + APP_NAME) + for target in ["aarch64-apple-darwin", "x86_64-apple-darwin"]: + universal.add_path("targets/" + target + "/" + APP_NAME) - m = FileManifest() - m.add_file(universal.to_file_content()) - bundle.add_macos_manifest(m) + m = FileManifest() + m.add_file(universal.to_file_content()) + bundle.add_macos_manifest(m) - return bundle - -register_target("windows_installers", make_exe_installer, default = True) + return bundle +register_target("windows_installers", make_exe_installer, default=True) register_target("macos_app_bundle", make_macos_app_bundle) resolve_targets() diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 4ce32f0b..030dafd6 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -33,10 +33,31 @@ else: _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated") _import_structure: dict[str, list[str]] = { - "exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], - "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_quantisation": ["infer_quantisation_config"], "_embeddings": ["GenericEmbeddingRunnable"], - "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], - "models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": [], "models.baichuan": [], "models.dolly_v2": [], "models.falcon": [], "models.flan_t5": [], "models.gpt_neox": [], "models.llama": [], "models.mpt": [], "models.opt": [], "models.stablelm": [], "models.starcoder": [] + "exceptions": [], + "models": [], + "client": [], + "bundle": [], + "playground": [], + "testing": [], + "utils": ["infer_auto_class"], + "serialisation": ["ggml", "transformers"], + "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], + "_quantisation": ["infer_quantisation_config"], + "_embeddings": ["GenericEmbeddingRunnable"], + "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], + "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", 
"prepare_logits_processor"], + "models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], + "models.chatglm": [], + "models.baichuan": [], + "models.dolly_v2": [], + "models.falcon": [], + "models.flan_t5": [], + "models.gpt_neox": [], + "models.llama": [], + "models.mpt": [], + "models.opt": [], + "models.stablelm": [], + "models.starcoder": [] } COMPILED = _Path(__file__).suffix in (".pyd", ".so") diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py index 1db41ca9..af91e6d3 100644 --- a/openllm-python/src/openllm/_embeddings.py +++ b/openllm-python/src/openllm/_embeddings.py @@ -6,43 +6,44 @@ from bentoml._internal.frameworks.transformers import MODULE_NAME, API_VERSION from bentoml._internal.models.model import ModelOptions, ModelSignature if t.TYPE_CHECKING: import torch -_GENERIC_EMBEDDING_ID="sentence-transformers/all-MiniLM-L6-v2" -_BENTOMODEL_ID="sentence-transformers--all-MiniLM-L6-v2" - +_GENERIC_EMBEDDING_ID = "sentence-transformers/all-MiniLM-L6-v2" +_BENTOMODEL_ID = "sentence-transformers--all-MiniLM-L6-v2" def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model: try: return bentoml.transformers.get(ids) except bentoml.exceptions.NotFound: model_signatures = {k: ModelSignature(batchable=False) for k in ("forward", "generate", "contrastive_search", "greedy_search", "sample", "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__")} with bentoml.models.create(ids, module=MODULE_NAME, api_version=API_VERSION, options=ModelOptions(), context=openllm.utils.generate_context(framework_name="transformers"), labels={"runtime": "pt", "framework": "openllm"}, signatures=model_signatures) as bentomodel: - snapshot_download(_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors","*.h5","*.ot","*.pdf","*.md",".gitattributes","LICENSE.txt"]) + snapshot_download(_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors", "*.h5", "*.ot", "*.pdf", "*.md", ".gitattributes", "LICENSE.txt"]) return bentomodel - class GenericEmbeddingRunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu") SUPPORTS_CPU_MULTI_THREADING = True + def __init__(self) -> None: self.device = "cuda" if openllm.utils.device_count() > 0 else "cpu" self._bentomodel = get_or_download() self.tokenizer = transformers.AutoTokenizer.from_pretrained(self._bentomodel.path) self.model = transformers.AutoModel.from_pretrained(self._bentomodel.path) self.model.to(self.device) + @bentoml.Runnable.method(batchable=True, batch_dim=0) def encode(self, sentences: list[str]) -> t.Sequence[openllm.LLMEmbeddings]: import torch, torch.nn.functional as F encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(self.device) attention_mask = encoded_input["attention_mask"] # Compute token embeddings - with torch.no_grad(): model_output = self.model(**encoded_input) + with torch.no_grad(): + model_output = self.model(**encoded_input) # Perform pooling and normalize sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1) return [openllm.LLMEmbeddings(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))] + @staticmethod def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: import torch # Mean Pooling - Take attention 
mask into account for correct averaging - token_embeddings = model_output[0] # First element of model_output contains all token embeddings + token_embeddings = model_output[0] # First element of model_output contains all token embeddings input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) - __all__ = ["GenericEmbeddingRunnable"] diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py index 714bda76..64d9f50e 100644 --- a/openllm-python/src/openllm/_generation.py +++ b/openllm-python/src/openllm/_generation.py @@ -6,14 +6,16 @@ if t.TYPE_CHECKING: import torch, openllm # reexport from transformers LogitsProcessorList = transformers.LogitsProcessorList StoppingCriteriaList = transformers.StoppingCriteriaList - class StopSequenceCriteria(transformers.StoppingCriteria): def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast): if isinstance(stop_sequences, str): stop_sequences = [stop_sequences] self.stop_sequences, self.tokenizer = stop_sequences, tokenizer - def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool: return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences) + + def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool: + return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences) class StopOnTokens(transformers.StoppingCriteria): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool: return input_ids[0][-1] in {50278, 50279, 50277, 1, 0} + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool: + return input_ids[0][-1] in {50278, 50279, 50277, 1, 0} def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsProcessorList: generation_config = config.generation_config logits_processor = transformers.LogitsProcessorList() @@ -22,18 +24,18 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr if 1e-8 <= generation_config["top_p"]: logits_processor.append(transformers.TopPLogitsWarper(generation_config["top_p"])) if generation_config["top_k"] > 0: logits_processor.append(transformers.TopKLogitsWarper(generation_config["top_k"])) return logits_processor - # NOTE: The ordering here is important. Some models have two of these and we have a preference for which value gets used. 
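For reference, the stop-sequence helpers shown above are ordinary transformers stopping criteria, so they can be dropped straight into a standard generate() call. A minimal sketch, assuming any causal LM/tokenizer pair is available locally (the gpt2 checkpoint, prompt, and stop strings are illustrative placeholders, not taken from this diff):

    import openllm, transformers

    tok = transformers.AutoTokenizer.from_pretrained("gpt2")
    model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

    # StopSequenceCriteria decodes the running sequence and stops once it ends with
    # any of the given stop strings; StoppingCriteriaList is the transformers
    # container re-exported by openllm._generation.
    criteria = openllm.StoppingCriteriaList([openllm.StopSequenceCriteria(["\n\n", "###"], tok)])
    inputs = tok("Q: What does OpenLLM do?\nA:", return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=64, stopping_criteria=criteria)
    print(tok.decode(out[0], skip_special_tokens=True))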
SEQLEN_KEYS = ["max_sequence_length", "seq_length", "max_position_embeddings", "max_seq_len", "model_max_length"] def get_context_length(config: transformers.PretrainedConfig) -> int: rope_scaling = getattr(config, "rope_scaling", None) rope_scaling_factor = config.rope_scaling["factor"] if rope_scaling else 1.0 for key in SEQLEN_KEYS: - if getattr(config, key, None) is not None: return int(rope_scaling_factor*getattr(config,key)) + if getattr(config, key, None) is not None: return int(rope_scaling_factor * getattr(config, key)) return 2048 -def is_sentence_complete(output: str) -> bool: return output.endswith((".", "?", "!", "...", "。", "?", "!", "…", '"', "'", "”")) +def is_sentence_complete(output: str) -> bool: + return output.endswith((".", "?", "!", "...", "。", "?", "!", "…", '"', "'", "”")) def is_partial_stop(output: str, stop_str: str) -> bool: - """Check whether the output contains a partial stop str.""" - for i in range(0, min(len(output), len(stop_str))): - if stop_str.startswith(output[-i:]): return True - return False + """Check whether the output contains a partial stop str.""" + for i in range(0, min(len(output), len(stop_str))): + if stop_str.startswith(output[-i:]): return True + return False diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 54bd3678..03fc45fa 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -5,53 +5,14 @@ from pathlib import Path from huggingface_hub import hf_hub_download from bentoml._internal.models.model import ModelSignature -from openllm_core._configuration import ( - FineTuneConfig, - LLMConfig, - _object_getattribute, - _setattr_class, -) +from openllm_core._configuration import FineTuneConfig, LLMConfig, _object_getattribute, _setattr_class from ._quantisation import infer_quantisation_config from openllm_core._schema import unmarshal_vllm_outputs from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException from .models.auto import AutoConfig -from openllm_core.utils import ( - DEBUG, - ENV_VARS_TRUE_VALUES, - MYPY, - EnvVarMixin, - LazyLoader, - ReprMixin, - apply, - bentoml_cattr, - codegen, - device_count, - first_not_none, - generate_hash_from_file, - is_peft_available, - is_torch_available, - non_intrusive_setattr, - normalize_attrs_to_model_tokenizer_pair, - resolve_filepath, - validate_is_path, -) +from openllm_core.utils import DEBUG, ENV_VARS_TRUE_VALUES, MYPY, EnvVarMixin, LazyLoader, ReprMixin, apply, bentoml_cattr, codegen, device_count, first_not_none, generate_hash_from_file, is_peft_available, is_torch_available, non_intrusive_setattr, normalize_attrs_to_model_tokenizer_pair, resolve_filepath, validate_is_path from .utils import infer_auto_class -from openllm_core._typing_compat import ( - AdaptersMapping, - AdaptersTuple, - AnyCallable, - AdapterType, - LiteralRuntime, - DictStrAny, - ListStr, - LLMEmbeddings, - LLMRunnable, - LLMRunner, - ModelSignatureDict as _ModelSignatureDict, - PeftAdapterOutput, - TupleAny, - NotRequired, overload, M, T, LiteralString -) +from openllm_core._typing_compat import AdaptersMapping, AdaptersTuple, AnyCallable, AdapterType, LiteralRuntime, DictStrAny, ListStr, LLMEmbeddings, LLMRunnable, LLMRunner, ModelSignatureDict as _ModelSignatureDict, PeftAdapterOutput, TupleAny, NotRequired, overload, M, T, LiteralString if t.TYPE_CHECKING: import auto_gptq as autogptq, peft, torch, transformers, vllm @@ -72,13 +33,10 @@ class ModelSignatureDict(t.TypedDict, total=False): batch_dim: 
t.Union[t.Tuple[int, int], int] input_spec: NotRequired[t.Union[t.Any, t.Tuple[t.Any]]] output_spec: NotRequired[t.Any] - def normalise_model_name(name: str) -> str: return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else re.sub("[^a-zA-Z0-9]+", "-", name) - # the below is similar to peft.utils.other.CONFIG_NAME PEFT_CONFIG_NAME = "adapter_config.json" - def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapping: """Resolve the type of the PeftConfig given the adapter_map. @@ -109,9 +67,7 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp if _peft_type not in resolved: resolved[_peft_type] = () resolved[_peft_type] += (_AdaptersTuple((path_or_adapter_id, resolve_name, resolved_config)),) return resolved - _reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"} - class LLMInterface(ABC, t.Generic[M, T]): """This defines the loose contract for all openllm.LLM implementations.""" @property @@ -245,23 +201,42 @@ class LLMInterface(ABC, t.Generic[M, T]): __llm_supports_generate_iterator__: bool """A boolean to determine whether models does implement ``LLM.generate_iterator``.""" if t.TYPE_CHECKING and not MYPY: - def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]], model_id: str, runtime: t.Literal["ggml", "transformers"], model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag, adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str], quantize_method: t.Optional[t.Literal["int8", "int4", "gptq"]], serialisation_format: t.Literal["safetensors", "legacy"], _local: bool, **attrs: t.Any) -> None: - """Generated __attrs_init__ for openllm.LLM.""" + def __attrs_init__( + self, + config: LLMConfig, + quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]], + model_id: str, + runtime: t.Literal["ggml", "transformers"], + model_decls: TupleAny, + model_attrs: DictStrAny, + tokenizer_attrs: DictStrAny, + tag: bentoml.Tag, + adapters_mapping: t.Optional[AdaptersMapping], + model_version: t.Optional[str], + quantize_method: t.Optional[t.Literal["int8", "int4", "gptq"]], + serialisation_format: t.Literal["safetensors", "legacy"], + _local: bool, + **attrs: t.Any + ) -> None: + """Generated __attrs_init__ for openllm.LLM.""" _R = t.TypeVar("_R", covariant=True) class _import_model_wrapper(t.Generic[_R, M, T], t.Protocol): - def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R: ... + def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R: + ... class _load_model_wrapper(t.Generic[M, T], t.Protocol): - def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: ... + def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: + ... class _load_tokenizer_wrapper(t.Generic[M, T], t.Protocol): - def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T: ... + def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T: + ... class _llm_post_init_wrapper(t.Generic[M, T], t.Protocol): - def __call__(self, llm: LLM[M, T]) -> T: ... + def __call__(self, llm: LLM[M, T]) -> T: + ... class _save_pretrained_wrapper(t.Generic[M, T], t.Protocol): - def __call__(self, llm: LLM[M, T], save_directory: str | Path, **attrs: t.Any) -> None: ... 
- + def __call__(self, llm: LLM[M, T], save_directory: str | Path, **attrs: t.Any) -> None: + ... _object_setattr = object.__setattr__ - # NOTE: the following wrapper are a light meta ops for wrapping default params to internal methods implementation. def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]: @functools.wraps(f) @@ -271,38 +246,39 @@ def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Ca decls = (*model_decls, *decls) attrs = {**model_attrs, **attrs} return f(self, *decls, trust_remote_code=trust_remote_code, **attrs) + return wrapper - _DEFAULT_TOKENIZER = "hf-internal-testing/llama-tokenizer" - -def get_engine_args(llm: LLM[M, T], tokenizer: str = _DEFAULT_TOKENIZER) -> vllm.EngineArgs: return vllm.EngineArgs(model=llm._bentomodel.path, tokenizer=tokenizer, tokenizer_mode="auto", tensor_parallel_size=1 if device_count() < 2 else device_count(), dtype="auto", worker_use_ray=False) - +def get_engine_args(llm: LLM[M, T], tokenizer: str = _DEFAULT_TOKENIZER) -> vllm.EngineArgs: + return vllm.EngineArgs(model=llm._bentomodel.path, tokenizer=tokenizer, tokenizer_mode="auto", tensor_parallel_size=1 if device_count() < 2 else device_count(), dtype="auto", worker_use_ray=False) def _wrapped_load_model(f: _load_model_wrapper[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]: @functools.wraps(f) def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine: if self.__llm_implementation__ == "vllm": # TODO: Do some more processing with token_id once we support token streaming - try: return vllm.LLMEngine.from_engine_args(get_engine_args(self, tokenizer=self._bentomodel.path if self.tokenizer_id == "local" else self.tokenizer_id)) + try: + return vllm.LLMEngine.from_engine_args(get_engine_args(self, tokenizer=self._bentomodel.path if self.tokenizer_id == "local" else self.tokenizer_id)) except Exception as err: traceback.print_exc() raise OpenLLMException(f"Failed to initialise vLLMEngine due to the following error:\n{err}") from None else: (model_decls, model_attrs), _ = self.llm_parameters return f(self, *(*model_decls, *decls), **{**model_attrs, **attrs}) - return wrapper + return wrapper def _wrapped_load_tokenizer(f: _load_tokenizer_wrapper[M, T]) -> t.Callable[[LLM[M, T]], T]: @functools.wraps(f) - def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T: return f(self, **{**self.llm_parameters[-1], **tokenizer_attrs}) - return wrapper + def wrapper(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T: + return f(self, **{**self.llm_parameters[-1], **tokenizer_attrs}) + return wrapper def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M, T]], None]: @functools.wraps(f) def wrapper(self: LLM[M, T]) -> None: if self.__llm_implementation__ == "pt" and is_torch_available(): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") f(self) - return wrapper + return wrapper def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[LLM[M, T], str | Path], None]: @functools.wraps(f) def wrapper(self: LLM[M, T], save_directory: str | Path, **attrs: t.Any) -> None: @@ -310,8 +286,8 @@ def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[L if self.__llm_model__ is None: raise RuntimeError("Cannot 'save_pretrained' with unload model instance.") if self.bettertransformer and self.__llm_implementation__ == "pt": _object_setattr(self, "__llm_model__", t.cast("transformers.PreTrainedModel", 
self.__llm_model__).reverse_bettertransformer()) f(self, save_directory, **attrs) - return wrapper + return wrapper def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable: # update docstring for given entrypoint original_fn = getattr(cls, fn, getattr(LLMInterface, fn)) @@ -323,7 +299,6 @@ def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable: """ setattr(cls, fn, original_fn) return original_fn - def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]: attributes = {"import_model": _wrapped_import_model, "load_model": _wrapped_load_model, "load_tokenizer": _wrapped_load_tokenizer, "llm_post_init": _wrapped_llm_post_init, "save_pretrained": _wrapped_save_pretrained} args: ListStr = [] @@ -356,8 +331,8 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]] lines.extend([_setattr_class(key, f"cls.{fn} is not _cached_LLMInterface_get('{fn}')"), f"__gen_docstring(cls, '{fn}')",]) anns[key] = interface_anns.get(key) return codegen.generate_function(cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns) - -def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: return generation_result[0]["outputs"][0]["text"] +def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: + return generation_result[0]["outputs"][0]["text"] def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]: outputs: list[vllm.RequestOutput] = [] # TODO: support prompt_token_ids @@ -365,9 +340,7 @@ def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) - while self.model.has_unfinished_requests(): outputs.extend([r for r in self.model.step() if r.finished]) return [unmarshal_vllm_outputs(i) for i in outputs] - _AdaptersTuple: type[AdaptersTuple] = codegen.make_attr_tuple_class("AdaptersTuple", ["adapter_id", "name", "config"]) - @attr.define(slots=True, repr=False, init=False) class LLM(LLMInterface[M, T], ReprMixin): if t.TYPE_CHECKING: __name__: str @@ -405,6 +378,8 @@ class LLM(LLMInterface[M, T], ReprMixin): elif "config_class" not in cd: raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.") _make_assignment_script(cls)(cls) if "tokenizer_id" not in cd and cls.__llm_implementation__ == "vllm": cls.tokenizer_id = _DEFAULT_TOKENIZER + + # fmt: off @overload def __getitem__(self, item: t.Literal["trust_remote_code"]) -> bool: ... 
@overload @@ -525,10 +500,8 @@ class LLM(LLMInterface[M, T], ReprMixin): except Exception as err: raise OpenLLMException(f"Failed to generate a valid tag for {cfg_cls.__openllm_start_name__} with 'model_id={_model_id}' (lookup to see its traceback):\n{err}") from err - return cls( - *args, model_id=_model_id, llm_config=llm_config, quantization_config=quantization_config, _quantize_method=quantize, _model_version=_tag.version, _tag=_tag, _serialisation_format=serialisation, _local=_local, bettertransformer=str(first_not_none(bettertransformer, os.environ.get(cfg_cls.__openllm_env__["bettertransformer"]), default=None)).upper() in ENV_VARS_TRUE_VALUES, - _runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal["ggml", "transformers"]], os.environ.get(cfg_cls.__openllm_env__["runtime"])), default=cfg_cls.__openllm_runtime__), _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, **attrs - ) + return cls(*args, model_id=_model_id, llm_config=llm_config, quantization_config=quantization_config, _quantize_method=quantize, _model_version=_tag.version, _tag=_tag, _serialisation_format=serialisation, _local=_local, bettertransformer=str(first_not_none(bettertransformer, os.environ.get(cfg_cls.__openllm_env__["bettertransformer"]), default=None)).upper() in ENV_VARS_TRUE_VALUES, _runtime=first_not_none(runtime, t.cast(t.Optional[t.Literal["ggml", "transformers"]], os.environ.get(cfg_cls.__openllm_env__["runtime"])), default=cfg_cls.__openllm_runtime__), _adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None, **attrs) + # fmt: on @classmethod @apply(str.lower) @@ -568,12 +541,10 @@ class LLM(LLMInterface[M, T], ReprMixin): return f"{tag_name}:{model_version}" @classmethod - def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag: return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs)) + def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag: + return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs)) - def __init__( - self, *args: t.Any, model_id: str, llm_config: LLMConfig, bettertransformer: bool | None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, _quantize_method: t.Literal["int8", "int4", "gptq"] | None, _runtime: t.Literal["ggml", "transformers"], _model_version: str, - _serialisation_format: t.Literal["safetensors", "legacy"], _local: bool, **attrs: t.Any, - ): + def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig, bettertransformer: bool | None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, _quantize_method: t.Literal["int8", "int4", "gptq"] | None, _runtime: t.Literal["ggml", "transformers"], _model_version: str, _serialisation_format: t.Literal["safetensors", "legacy"], _local: bool, **attrs: t.Any,): """Initialize the LLM with given pretrained model. > [!WARNING] @@ -685,32 +656,55 @@ class LLM(LLMInterface[M, T], ReprMixin): def __setattr__(self, attr: str, value: t.Any) -> None: if attr in _reserved_namespace: raise ForbiddenAttributeError(f"{attr} should not be set during runtime as these value will be reflected during runtime. 
Instead, you can create a custom LLM subclass {self.__class__.__name__}.") super().__setattr__(attr, value) + @property - def adapters_mapping(self) -> AdaptersMapping | None: return self._adapters_mapping + def adapters_mapping(self) -> AdaptersMapping | None: + return self._adapters_mapping + @adapters_mapping.setter - def adapters_mapping(self, value: AdaptersMapping) -> None: self._adapters_mapping = value + def adapters_mapping(self, value: AdaptersMapping) -> None: + self._adapters_mapping = value + @property - def __repr_keys__(self) -> set[str]: return {"model_id", "runner_name", "config", "adapters_mapping", "runtime", "tag"} + def __repr_keys__(self) -> set[str]: + return {"model_id", "runner_name", "config", "adapters_mapping", "runtime", "tag"} + def __repr_args__(self) -> ReprArgs: for k in self.__repr_keys__: if k == "config": yield k, self.config.model_dump(flatten=True) else: yield k, getattr(self, k) + @property - def model_id(self) -> str: return self._model_id + def model_id(self) -> str: + return self._model_id + @property - def runtime(self) -> t.Literal["ggml", "transformers"]: return self._runtime + def runtime(self) -> t.Literal["ggml", "transformers"]: + return self._runtime + @property - def runner_name(self) -> str: return f"llm-{self.config['start_name']}-runner" + def runner_name(self) -> str: + return f"llm-{self.config['start_name']}-runner" + # NOTE: The section below defines a loose contract with langchain's LLM interface. @property - def llm_type(self) -> str: return normalise_model_name(self._model_id) + def llm_type(self) -> str: + return normalise_model_name(self._model_id) + @property - def identifying_params(self) -> DictStrAny: return {"configuration": self.config.model_dump_json().decode(), "model_ids": orjson.dumps(self.config["model_ids"]).decode()} + def identifying_params(self) -> DictStrAny: + return {"configuration": self.config.model_dump_json().decode(), "model_ids": orjson.dumps(self.config["model_ids"]).decode()} + @property - def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: return (self._model_decls, self._model_attrs), self._tokenizer_attrs + def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: + return (self._model_decls, self._model_attrs), self._tokenizer_attrs + @property - def tag(self) -> bentoml.Tag: return self._tag - def ensure_model_id_exists(self) -> bentoml.Model: return openllm.import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format) + def tag(self) -> bentoml.Tag: + return self._tag + + def ensure_model_id_exists(self) -> bentoml.Model: + return openllm.import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format) @property def _bentomodel(self) -> bentoml.Model: @@ -727,6 +721,7 @@ class LLM(LLMInterface[M, T], ReprMixin): - The attributes dictionary that will be passed into `self.postprocess_generate`. 
""" return self.config.sanitize_parameters(prompt, **attrs) + def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any: """This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.). @@ -874,21 +869,22 @@ class LLM(LLMInterface[M, T], ReprMixin): """ models = models if models is not None else [] - try: models.append(self._bentomodel) - except bentoml.exceptions.NotFound as err: raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None + try: + models.append(self._bentomodel) + except bentoml.exceptions.NotFound as err: + raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False))) embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0))) generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False))) # NOTE: returning the two langchain API's to the runner - return llm_runner_class(self)( - llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig), name=self.runner_name, embedded=False, models=models, max_batch_size=max_batch_size, max_latency_ms=max_latency_ms, - method_configs=bentoml_cattr.unstructure({"embeddings": embeddings_sig, "__call__": generate_sig, "generate": generate_sig, "generate_one": generate_sig, "generate_iterator": generate_iterator_sig}), scheduling_strategy=scheduling_strategy, - ) + return llm_runner_class(self)(llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig), name=self.runner_name, embedded=False, models=models, max_batch_size=max_batch_size, max_latency_ms=max_latency_ms, method_configs=bentoml_cattr.unstructure({"embeddings": embeddings_sig, "__call__": generate_sig, "generate": generate_sig, "generate_one": generate_sig, "generate_iterator": generate_iterator_sig}), scheduling_strategy=scheduling_strategy,) # NOTE: Scikit API - def predict(self, prompt: str, **attrs: t.Any) -> t.Any: return self(prompt, **attrs) + def predict(self, prompt: str, **attrs: t.Any) -> t.Any: + return self(prompt, **attrs) + def __call__(self, prompt: str, **attrs: t.Any) -> t.Any: """Returns the generation result and format the result. 
@@ -908,11 +904,11 @@ class LLM(LLMInterface[M, T], ReprMixin): def generate(self, prompt: str, **attrs: t.Any) -> t.List[t.Any]: # TODO: support different generation strategies, similar to self.model.generate - for it in self.generate_iterator(prompt, **attrs): pass + for it in self.generate_iterator(prompt, **attrs): + pass return [it] - def generate_iterator(self, prompt: str, /, - *, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]: + def generate_iterator(self, prompt: str, /, *, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]: # NOTE: encoder-decoder models will need to implement their own generate_iterator for now # inspired from fastchat's generate_stream_func from ._generation import prepare_logits_processor, get_context_length, is_partial_stop @@ -937,7 +933,7 @@ class LLM(LLMInterface[M, T], ReprMixin): if i == 0: # prefill out = self.model(torch.as_tensor([input_ids], device=self.device), use_cache=True) else: # decoding - out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values) # type: ignore[has-type] + out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values) logits = out.logits past_key_values = out.past_key_values @@ -990,7 +986,7 @@ class LLM(LLMInterface[M, T], ReprMixin): del past_key_values, out gc.collect() torch.cuda.empty_cache() - +# fmt: off @overload def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ... 
@overload @@ -1039,16 +1035,16 @@ def Runner(model_name: str, ensure_available: bool | None = None, init_local: bo runner = infer_auto_class(implementation).create_runner(model_name, llm_config=llm_config, ensure_available=ensure_available if ensure_available is not None else init_local, **attrs) if init_local: runner.init_local(quiet=True) return runner - +# fmt: off def method_signature(sig: ModelSignature) -> ModelSignatureDict: return bentoml_cattr.unstructure(sig) class SetAdapterOutput(t.TypedDict): success: bool message: str - def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature, generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]: class _Runnable(bentoml.Runnable): SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu", "cpu") SUPPORTS_CPU_MULTI_THREADING = True + def __init__(__self: _Runnable): # NOTE: The side effect of this line # is that it will load the imported model during @@ -1057,28 +1053,35 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate if self.adapters_mapping is not None: logger.info("Applying LoRA to %s...", self.runner_name) self.apply_adapter(inference_mode=True, load_adapters="all") + def set_adapter(__self: _Runnable, adapter_name: str) -> None: if self.__llm_adapter_map__ is None: raise ValueError("No adapters available for current running server.") elif not isinstance(self.model, peft.PeftModel): raise RuntimeError("Model is not a PeftModel") if adapter_name != "default": self.model.set_adapter(adapter_name) logger.info("Successfully apply LoRA layer %s", adapter_name) + @bentoml.Runnable.method(**method_signature(embeddings_sig)) - def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]: return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)] + def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]: + return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)] + @bentoml.Runnable.method(**method_signature(generate_sig)) def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]: adapter_name = attrs.pop("adapter_name", None) if adapter_name is not None: __self.set_adapter(adapter_name) return self.generate(prompt, **attrs) + @bentoml.Runnable.method(**method_signature(generate_sig)) def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]: adapter_name = attrs.pop("adapter_name", None) if adapter_name is not None: __self.set_adapter(adapter_name) return self.generate(prompt, **attrs) + @bentoml.Runnable.method(**method_signature(generate_sig)) def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal["generated_text"], str]]: adapter_name = attrs.pop("adapter_name", None) if adapter_name is not None: __self.set_adapter(adapter_name) return self.generate_one(prompt, stop, **attrs) + @bentoml.Runnable.method(**method_signature(generate_iterator_sig)) def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]: adapter_name = attrs.pop("adapter_name", None) @@ -1094,13 +1097,13 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate return " ".join(output_text) return types.new_class(self.__class__.__name__ + "Runnable", (_Runnable,), {}, lambda ns: ns.update({"SUPPORTED_RESOURCES": ("nvidia.com/gpu", "amd.com/gpu") if self.config["requires_gpu"] else ("nvidia.com/gpu", "amd.com/gpu", "cpu"), "__module__": 
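The Runner factory above is the main user-facing entry point touched by this reformat; a minimal local-usage sketch, mirroring how _service.py constructs its runner further down in this diff (the model name, model id, and prompt are illustrative placeholders, not taken from this diff):

    import openllm

    # init_local=True loads the model in-process via runner.init_local(quiet=True),
    # matching the branch at the end of Runner() above; ensure_available then
    # defaults to the same value.
    runner = openllm.Runner("opt", model_id="facebook/opt-125m", init_local=True)

    # runner.__call__ is bound (in llm_runner_class, just below) to a wrapper that
    # sanitises the prompt, runs generate on the runner, then post-processes the
    # raw generation result.
    print(runner("What is the weather in San Francisco?"))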
self.__module__, "__doc__": self.config["env"].start_docstring})) - def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: def available_adapters(_: LLMRunner[M, T]) -> PeftAdapterOutput: if not is_peft_available(): return PeftAdapterOutput(success=False, result={}, error_msg="peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'") if self.__llm_adapter_map__ is None: return PeftAdapterOutput(success=False, result={}, error_msg="No adapters available for current running server.") if not isinstance(self.model, peft.PeftModel): return PeftAdapterOutput(success=False, result={}, error_msg="Model is not a PeftModel") return PeftAdapterOutput(success=True, result=self.model.peft_config, error_msg="") + def _wrapped_generate_run(__self: LLMRunner[M, T], prompt: str, **kwargs: t.Any) -> t.Any: """Wrapper for runner.generate.run() to handle the prompt and postprocessing. @@ -1128,7 +1131,9 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: """ return __self.embeddings.run([prompt] if isinstance(prompt, str) else prompt) - def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: return {"config", "llm_type", "runner_methods", "runtime", "llm_tag"} + def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: + return {"config", "llm_type", "runner_methods", "runtime", "llm_tag"} + def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs: yield "runner_methods", {method.name: {"batchable": method.config.batchable, "batch_dim": method.config.batch_dim if method.config.batchable else None} for method in __self.runner_methods} yield "config", self.config.model_dump(flatten=True) @@ -1136,8 +1141,10 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: yield "runtime", self.runtime yield "llm_tag", self.tag - return types.new_class(self.__class__.__name__ + "Runner", (bentoml.Runner,), exec_body=lambda ns: ns.update({"llm_type": self.llm_type, "identifying_params": self.identifying_params, "llm_tag": self.tag, "llm": self, "config": self.config, "implementation": self.__llm_implementation__, "peft_adapters": property(fget=available_adapters), - "download_model": self.ensure_model_id_exists, "__call__": _wrapped_generate_run, "embed": _wrapped_embeddings_run, "__module__": self.__module__, "__doc__": self.config["env"].start_docstring, "__repr__": ReprMixin.__repr__, - "__repr_keys__": property( _wrapped_repr_keys), "__repr_args__": _wrapped_repr_args, "supports_embeddings": self["supports_embeddings"], "supports_hf_agent": self["supports_generate_one"], "has_adapters": self._adapters_mapping is not None})) - + return types.new_class( + self.__class__.__name__ + "Runner", (bentoml.Runner,), + exec_body=lambda ns: ns.update({ + "llm_type": self.llm_type, "identifying_params": self.identifying_params, "llm_tag": self.tag, "llm": self, "config": self.config, "implementation": self.__llm_implementation__, "peft_adapters": property(fget=available_adapters), "download_model": self.ensure_model_id_exists, "__call__": _wrapped_generate_run, "embed": _wrapped_embeddings_run, "__module__": self.__module__, "__doc__": self.config["env"].start_docstring, "__repr__": ReprMixin.__repr__, "__repr_keys__": property(_wrapped_repr_keys), "__repr_args__": _wrapped_repr_args, "supports_embeddings": self["supports_embeddings"], "supports_hf_agent": self["supports_generate_one"], "has_adapters": self._adapters_mapping is not None + }) + ) __all__ = ["LLMRunner", "LLMRunnable", "Runner", "LLM", "llm_runner_class", "llm_runnable_class", "LLMEmbeddings"] diff --git 
a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index bdc8290f..dc7fb816 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -6,17 +6,17 @@ from openllm_core._typing_compat import overload if t.TYPE_CHECKING: from ._llm import LLM from openllm_core._typing_compat import DictStrAny - autogptq, torch, transformers = LazyLoader("autogptq", globals(), "auto_gptq"), LazyLoader("torch", globals(), "torch"), LazyLoader("transformers", globals(), "transformers") logger = logging.getLogger(__name__) QuantiseMode = t.Literal["int8", "int4", "gptq"] - @overload -def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal["int8", "int4"], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: ... +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal["int8", "int4"], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]: + ... @overload -def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal["gptq"], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: ... +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal["gptq"], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: + ... def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]: # 8 bit configuration int8_threshold = attrs.pop("llm_int8_threshhold", 6.0) @@ -52,6 +52,8 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo if not is_autogptq_available(): logger.warning("'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. 
OpenLLM will fallback to int8 with bitsandbytes.") quantisation_config = create_int8_config(int8_skip_modules) - else: quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs) - else: raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.") + else: + quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs) + else: + raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.") return quantisation_config, attrs diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 40a6553e..4df93135 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -1,12 +1,13 @@ +# mypy: disable-error-code="call-arg,misc,attr-defined,type-abstract" from __future__ import annotations -import os, warnings, orjson, bentoml, openllm, typing as t +import os, warnings, orjson, bentoml, openllm, openllm_core, typing as t from starlette.applications import Starlette from starlette.responses import JSONResponse from starlette.routing import Route if t.TYPE_CHECKING: from starlette.requests import Request from starlette.responses import Response - from bentoml._internal.runner.runner import RunnerMethod + from bentoml._internal.runner.runner import RunnerMethod, AbstractRunner # The following warnings from bitsandbytes, and probably not that important for users to see warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization") warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization") @@ -15,51 +16,83 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}") # openllm: model na adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""") # openllm: model adapter map llm_config = openllm.AutoConfig.for_model(model) runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map)) -generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300) -runners: t.Sequence[bentoml.Runner] = [runner] +generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm_core.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300) # type: ignore[arg-type] # XXX: remove once bentoml.Runner is correct set with type. 
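For orientation, a minimal client-side sketch of how the /v1/generate route defined just below can be exercised once this service is running. The port and the max_new_tokens override are illustrative assumptions, not part of this diff:

import requests  # assumes the service is served on BentoML's default HTTP port (an assumption, adjust as needed)

payload = {
    "prompt": "What is the meaning of life?",
    "llm_config": {"max_new_tokens": 128},  # flattened LLMConfig overrides, same shape as the from_sample payload
    "adapter_name": "",                     # empty string: use the base weights rather than a LoRA adapter
}
resp = requests.post("http://localhost:3000/v1/generate", json=payload, timeout=3600)
resp.raise_for_status()
print(resp.json()["responses"])  # GenerationOutput also carries the resolved "configuration"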
+runners: list[AbstractRunner] = [runner] if not runner.supports_embeddings: runners.append(generic_embedding_runner) svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners) -_JsonInput=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True), "adapter_name": ""}) - +_JsonInput = bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True), "adapter_name": ""}) @svc.api(route="/v1/generate", input=_JsonInput, output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)})) async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput: qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) config = qa_inputs.llm_config.model_dump() responses = await runner.generate.async_run(qa_inputs.prompt, **{"adapter_name": qa_inputs.adapter_name, **config}) return openllm.GenerationOutput(responses=responses, configuration=config) - -@svc.api(route="/v1/generate_stream", input=_JsonInput,output=bentoml.io.Text(content_type="text/event_stream")) +@svc.api(route="/v1/generate_stream", input=_JsonInput, output=bentoml.io.Text(content_type="text/event_stream")) async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]: qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, **qa_inputs.llm_config.model_dump()) - @svc.api(route="/v1/metadata", input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample({"model_id": runner.llm.model_id, "timeout": 3600, "model_name": llm_config["model_name"], "framework": "pt", "configuration": "", "supports_embeddings": runner.supports_embeddings, "supports_hf_agent": runner.supports_hf_agent})) def metadata_v1(_: str) -> openllm.MetadataOutput: return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent) - -@svc.api(route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), output=bentoml.io.JSON.from_sample({"embeddings": [0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076], "num_tokens": 20})) +@svc.api( + route="/v1/embeddings", + input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), + output=bentoml.io.JSON.from_sample({ + "embeddings": [ + 0.007917795330286026, + -0.014421648345887661, + 0.00481307040899992, + 0.007331526838243008, + -0.0066398633643984795, + 0.00945580005645752, + 0.0087016262114048, + -0.010709521360695362, + 0.012635177001357079, + 0.010541186667978764, + -0.00730888033285737, + -0.001783102168701589, + 0.02339819073677063, + 
-0.010825827717781067, + -0.015888236463069916, + 0.01876218430697918, + 0.0076906150206923485, + 0.0009032754460349679, + -0.010024012066423893, + 0.01090280432254076, + -0.008668390102684498, + 0.02070549875497818, + 0.0014594447566196322, + -0.018775740638375282, + -0.014814382418990135, + 0.01796768605709076 + ], + "num_tokens": 20 + }) +) async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput: - embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode + embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type] responses = (await embed_call.async_run(phrases))[0] return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"]) - if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent(): + async def hf_agent(request: Request) -> Response: json_str = await request.body() - try: input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), openllm.HfAgentInput) - except orjson.JSONDecodeError as err: raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None + try: + input_data = openllm.utils.bentoml_cattr.structure(orjson.loads(json_str), openllm.HfAgentInput) + except orjson.JSONDecodeError as err: + raise openllm.exceptions.OpenLLMException(f"Invalid JSON input received: {err}") from None stop = input_data.parameters.pop("stop", ["\n"]) - try: return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters), status_code=200) - except NotImplementedError: return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500) + try: + return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters), status_code=200) + except NotImplementedError: + return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500) hf_app = Starlette(debug=True, routes=[Route("/agent", hf_agent, methods=["POST"])]) svc.mount_asgi_app(hf_app, path="/hf") - async def list_adapter_v1(_: Request) -> Response: res: dict[str, t.Any] = {} if runner.peft_adapters["success"] is True: res["result"] = {k: v.to_dict() for k, v in runner.peft_adapters["result"].items()} res.update({"success": runner.peft_adapters["success"], "error_msg": runner.peft_adapters["error_msg"]}) return JSONResponse(res, status_code=200) - adapters_app_v1 = Starlette(debug=True, routes=[Route("/adapters", list_adapter_v1, methods=["GET"])]) svc.mount_asgi_app(adapters_app_v1, path="/v1") diff --git a/openllm-python/src/openllm/bundle/__init__.py b/openllm-python/src/openllm/bundle/__init__.py index 7f66d890..16517ef2 100644 --- a/openllm-python/src/openllm/bundle/__init__.py +++ b/openllm-python/src/openllm/bundle/__init__.py @@ -5,26 +5,12 @@ These utilities will stay internal, and its API can be changed or updated withou from __future__ import annotations import os, typing as t from openllm_core.utils import LazyModule - _import_structure: dict[str, list[str]] = {"_package": ["create_bento", "build_editable", "construct_python_options", "construct_docker_options"], "oci": ["CONTAINER_NAMES", "get_base_container_tag", 
"build_container", "get_base_container_name", "supported_registries", "RefResolver"]} if t.TYPE_CHECKING: from . import _package as _package, oci as oci - from ._package import ( - build_editable as build_editable, - construct_docker_options as construct_docker_options, - construct_python_options as construct_python_options, - create_bento as create_bento, - ) - from .oci import ( - CONTAINER_NAMES as CONTAINER_NAMES, - RefResolver as RefResolver, - build_container as build_container, - get_base_container_name as get_base_container_name, - get_base_container_tag as get_base_container_tag, - supported_registries as supported_registries, - ) - + from ._package import build_editable as build_editable, construct_docker_options as construct_docker_options, construct_python_options as construct_python_options, create_bento as create_bento + from .oci import CONTAINER_NAMES as CONTAINER_NAMES, RefResolver as RefResolver, build_container as build_container, get_base_container_name as get_base_container_name, get_base_container_tag as get_base_container_tag, supported_registries as supported_registries __lazy = LazyModule(__name__, os.path.abspath("__file__"), _import_structure) __all__ = __lazy.__all__ __dir__ = __lazy.__dir__ diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 8b667834..db98f1aa 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -13,11 +13,9 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralString, LiteralContainerRegistry, LiteralContainerVersionStrategy from bentoml._internal.bento import BentoStore from bentoml._internal.models.model import ModelStore - logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = "OPENLLM_DEV_BUILD" - def build_editable(path: str, package: t.Literal["openllm", "openllm_core", "openllm_client"] = "openllm") -> str | None: """Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.""" if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != "true": return None @@ -26,7 +24,7 @@ def build_editable(path: str, package: t.Literal["openllm", "openllm_core", "ope from build.env import IsolatedEnvBuilder module_location = openllm_core.utils.pkg.source_locations(package) if not module_location: raise RuntimeError("Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.") - pyproject_path = Path(module_location).parent.parent/"pyproject.toml" + pyproject_path = Path(module_location).parent.parent / "pyproject.toml" if os.path.isfile(pyproject_path.__fspath__()): logger.info("Generating built wheels for package %s...", package) with IsolatedEnvBuilder() as env: @@ -36,7 +34,6 @@ def build_editable(path: str, package: t.Literal["openllm", "openllm_core", "ope env.install(builder.build_system_requires) return builder.build("wheel", path, config_settings={"--global-option": "--quiet"}) raise RuntimeError("Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.") - def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] 
| None = None, adapter_map: dict[str, str | None] | None = None,) -> PythonOptions: packages = ["openllm", "scipy"] # apparently bnb misses this one if adapter_map is not None: packages += ["openllm[fine-tune]"] @@ -67,7 +64,8 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d _tf_version = importlib.metadata.version(candidate) packages.extend([f"tensorflow>={_tf_version}"]) break - except importlib.metadata.PackageNotFoundError: pass # Ok to ignore here since we actually need to check for all possible tensorflow distribution. + except importlib.metadata.PackageNotFoundError: + pass # Ok to ignore here since we actually need to check for all possible tensorflow distribution. else: if not openllm_core.utils.is_torch_available(): raise ValueError("PyTorch is not available. Make sure to have it locally installed.") packages.extend([f'torch>={importlib.metadata.version("torch")}']) @@ -75,18 +73,12 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d built_wheels: list[str | None] = [build_editable(llm_fs.getsyspath("/"), t.cast(t.Literal["openllm", "openllm_core", "openllm_client"], p)) for p in ("openllm_core", "openllm_client", "openllm")] if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)]) return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"]) - def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: from openllm.cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy()) env: openllm_core.utils.EnvVarMixin = llm.config["env"] if env["framework_value"] == "vllm": serialisation_format = "legacy" - env_dict = { - env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'", - env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}", - "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format, - "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'", - } + env_dict = {env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'", env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format, "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'",} if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1") # We need to handle None separately here, as env from subprocess doesn't accept None value. 
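The comment that closes the hunk above alludes to a general constraint: neither os.environ nor a Docker env block accepts None values, so only keys that actually resolved to a value should be materialised. A standalone sketch of that pattern, with an invented helper name for illustration:

def merge_optional_env(env: dict[str, str], **candidates: str | None) -> dict[str, str]:
    # Copy only keys whose value is not None; subprocess and Docker environments reject None.
    env.update({k: v for k, v in candidates.items() if v is not None})
    return env

merge_optional_env({"BENTOML_DEBUG": "True"}, OPENLLM_QUANTIZE=None, OPENLLM_RUNTIME="transformers")
# -> {'BENTOML_DEBUG': 'True', 'OPENLLM_RUNTIME': 'transformers'}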
@@ -96,43 +88,60 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_ if _env["quantize_value"] is not None: env_dict[_env.quantize] = t.cast(str, _env["quantize_value"]) env_dict[_env.runtime] = _env["runtime_value"] return DockerOptions(base_image=f"{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}", env=env_dict, dockerfile_template=dockerfile_template) - OPENLLM_MODEL_NAME = "# openllm: model name" OPENLLM_MODEL_ADAPTER_MAP = "# openllm: model adapter map" class ModelNameFormatter(string.Formatter): model_keyword: LiteralString = "__model_name__" + def __init__(self, model_name: str): """The formatter that extends model_name to be formatted the 'service.py'.""" super().__init__() self.model_name = model_name - def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any: return super().vformat(format_string, (), {self.model_keyword: self.model_name}) + + def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any: + return super().vformat(format_string, (), {self.model_keyword: self.model_name}) + def can_format(self, value: str) -> bool: try: self.parse(value) return True - except ValueError: return False + except ValueError: + return False class ModelIdFormatter(ModelNameFormatter): model_keyword: LiteralString = "__model_id__" class ModelAdapterMapFormatter(ModelNameFormatter): model_keyword: LiteralString = "__model_adapter_map__" - -_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py" +_service_file = Path(os.path.abspath(__file__)).parent.parent / "_service.py" def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None: from openllm_core.utils import DEBUG model_name = llm.config["model_name"] logger.debug("Generating service file for %s at %s (dir=%s)", model_name, llm.config["service_name"], llm_fs.getsyspath("/")) - with open(_service_file.__fspath__(), "r") as f: src_contents = f.readlines() + with open(_service_file.__fspath__(), "r") as f: + src_contents = f.readlines() for it in src_contents: if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + "\n") elif OPENLLM_MODEL_ADAPTER_MAP in it: src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n") script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + "".join(src_contents) if DEBUG: logger.info("Generated script:\n%s", script) llm_fs.writetext(llm.config["service_name"], script) - @inject -def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.Any], workers_per_resource: str | float, quantize: LiteralString | None, bettertransformer: bool | None, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] 
| None = None, - runtime: t.Literal[ "ggml", "transformers"] = "transformers", serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", container_registry: LiteralContainerRegistry = "ecr", container_version_strategy: LiteralContainerVersionStrategy = "release", - _bento_store: BentoStore = Provide[BentoMLContainer.bento_store], _model_store: ModelStore = Provide[BentoMLContainer.model_store]) -> bentoml.Bento: +def create_bento( + bento_tag: bentoml.Tag, + llm_fs: FS, + llm: openllm.LLM[t.Any, t.Any], + workers_per_resource: str | float, + quantize: LiteralString | None, + bettertransformer: bool | None, + dockerfile_template: str | None, + adapter_map: dict[str, str | None] | None = None, + extra_dependencies: tuple[str, ...] | None = None, + runtime: t.Literal["ggml", "transformers"] = "transformers", + serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", + container_registry: LiteralContainerRegistry = "ecr", + container_version_strategy: LiteralContainerVersionStrategy = "release", + _bento_store: BentoStore = Provide[BentoMLContainer.bento_store], + _model_store: ModelStore = Provide[BentoMLContainer.model_store] +) -> bentoml.Bento: framework_envvar = llm.config["env"]["framework_value"] labels = dict(llm.identifying_params) labels.update({"_type": llm.llm_type, "_framework": framework_envvar, "start_name": llm.config["start_name"], "base_name_or_path": llm.model_id, "bundler": "openllm.bundle"}) @@ -141,16 +150,26 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A if workers_per_resource == "round_robin": workers_per_resource = 1.0 elif workers_per_resource == "conserved": workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count()) else: - try: workers_per_resource = float(workers_per_resource) - except ValueError: raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None - elif isinstance(workers_per_resource, int): workers_per_resource = float(workers_per_resource) + try: + workers_per_resource = float(workers_per_resource) + except ValueError: + raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None + elif isinstance(workers_per_resource, int): + workers_per_resource = float(workers_per_resource) logger.info("Building Bento for '%s'", llm.config["start_name"]) # add service.py definition to this temporary folder write_service(llm, adapter_map, llm_fs) llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name}) build_config = BentoBuildConfig( - service=f"{llm.config['service_name']}:svc", name=bento_tag.name, labels=labels, description=f"OpenLLM service for {llm.config['start_name']}", include=list(llm_fs.walk.files()), exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec], + service=f"{llm.config['service_name']}:svc", + name=bento_tag.name, + labels=labels, + description=f"OpenLLM service for {llm.config['start_name']}", + include=list(llm_fs.walk.files()), + exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"], + python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), + models=[llm_spec], docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, 
container_registry, container_version_strategy) ) diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index c7f2cd9d..52bc2bdd 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -10,7 +10,6 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy from ghapi import all from openllm_core._typing_compat import RefTuple, LiteralString - all = openllm_core.utils.LazyLoader("all", globals(), "ghapi.all") # noqa: F811 logger = logging.getLogger(__name__) @@ -29,19 +28,17 @@ _OWNER = "bentoml" _REPO = "openllm" _module_location = openllm_core.utils.pkg.source_locations("openllm") - @functools.lru_cache @openllm_core.utils.apply(str.lower) -def get_base_container_name(reg: LiteralContainerRegistry) -> str: return _CONTAINER_REGISTRY[reg] - -def _convert_version_from_string(s: str) -> VersionInfo: return VersionInfo.from_version_string(s) -def _commit_time_range(r: int = 5) -> str: return (datetime.now(timezone.utc) - timedelta(days=r)).strftime("%Y-%m-%dT%H:%M:%SZ") - +def get_base_container_name(reg: LiteralContainerRegistry) -> str: + return _CONTAINER_REGISTRY[reg] +def _convert_version_from_string(s: str) -> VersionInfo: + return VersionInfo.from_version_string(s) +def _commit_time_range(r: int = 5) -> str: + return (datetime.now(timezone.utc) - timedelta(days=r)).strftime("%Y-%m-%dT%H:%M:%SZ") class VersionNotSupported(openllm.exceptions.OpenLLMException): """Raised when the stable release is too low that it doesn't include OpenLLM base container.""" - _RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class("_RefTuple", ["git_hash", "version", "strategy"]) - def nightly_resolver(cls: type[RefResolver]) -> str: # NOTE: all openllm container will have sha- # This will use docker to run skopeo to determine the correct latest tag that is available @@ -53,15 +50,17 @@ def nightly_resolver(cls: type[RefResolver]) -> str: return next(f'sha-{it["sha"][:7]}' for it in commits if "[skip ci]" not in it["commit"]["message"]) # now is the correct behaviour return orjson.loads(subprocess.check_output([docker_bin, "run", "--rm", "-it", "quay.io/skopeo/stable:latest", "list-tags", "docker://ghcr.io/bentoml/openllm"]).decode().strip())["Tags"][-2] - @attr.attrs(eq=False, order=False, slots=True, frozen=True) class RefResolver: git_hash: str = attr.field() version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string) strategy: LiteralContainerVersionStrategy = attr.field() _ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO) + @classmethod - def _nightly_ref(cls) -> RefTuple: return _RefTuple((nightly_resolver(cls), "refs/heads/main", "nightly")) + def _nightly_ref(cls) -> RefTuple: + return _RefTuple((nightly_resolver(cls), "refs/heads/main", "nightly")) + @classmethod def _release_ref(cls, version_str: str | None = None) -> RefTuple: _use_base_strategy = version_str is None @@ -70,9 +69,11 @@ class RefResolver: meta: dict[str, t.Any] = cls._ghapi.repos.get_latest_release() version_str = meta["name"].lstrip("v") version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")["object"]["sha"], version_str) - else: version = ("", version_str) + else: + version = ("", version_str) if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't 
support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'") return _RefTuple((*version, "release" if _use_base_strategy else "custom")) + @classmethod @functools.lru_cache(maxsize=64) def from_strategy(cls, strategy_or_version: t.Literal["release", "nightly"] | LiteralString | None = None) -> RefResolver: @@ -85,19 +86,21 @@ class RefResolver: else: logger.warning("Using custom %s. Make sure that it is at least 0.2.12 for base container support.", strategy_or_version) return cls(*cls._release_ref(version_str=strategy_or_version)) + @property def tag(self) -> str: # NOTE: latest tag can also be nightly, but discouraged to use it. For nightly, refer to the sha- tags if self.strategy == "latest": return "latest" elif self.strategy == "nightly": return self.git_hash else: return repr(self.version) - @functools.lru_cache(maxsize=256) -def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str: return RefResolver.from_strategy(strategy).tag +def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str: + return RefResolver.from_strategy(strategy).tag def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None, version_strategy: LiteralContainerVersionStrategy = "release", push: bool = False, machine: bool = False) -> dict[str | LiteralContainerRegistry, str]: try: if not _BUILDER.health(): raise openllm.exceptions.Error - except (openllm.exceptions.Error, subprocess.CalledProcessError): raise RuntimeError("Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.") from None + except (openllm.exceptions.Error, subprocess.CalledProcessError): + raise RuntimeError("Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for installation instructions.") from None if openllm_core.utils.device_count() == 0: raise RuntimeError("Building base container requires GPUs (None available)") if not shutil.which("nvidia-container-runtime"): raise RuntimeError("NVIDIA Container Toolkit is required to compile CUDA kernels in the container.") if not _module_location: raise RuntimeError("Failed to determine source location of 'openllm'.
(Possible broken installation)") @@ -110,15 +113,16 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon try: outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), context_path=pyproject_path.parent.__fspath__(), tag=tuple(tags.values()), push=push, progress="plain" if openllm_core.utils.get_debug_mode() else "auto", quiet=machine) if machine and outputs is not None: tags["image_sha"] = outputs.decode("utf-8").strip() - except Exception as err: raise openllm.exceptions.OpenLLMException(f"Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}") from err + except Exception as err: + raise openllm.exceptions.OpenLLMException(f"Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}") from err return tags - if t.TYPE_CHECKING: CONTAINER_NAMES: dict[LiteralContainerRegistry, str] supported_registries: list[str] __all__ = ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"] -def __dir__() -> list[str]: return sorted(__all__) +def __dir__() -> list[str]: + return sorted(__all__) def __getattr__(name: str) -> t.Any: if name == "supported_registries": return functools.lru_cache(1)(lambda: list(_CONTAINER_REGISTRY))() elif name == "CONTAINER_NAMES": return _CONTAINER_REGISTRY diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index 8f0acafa..8c5e9f84 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ -11,7 +11,6 @@ from . import termui if t.TYPE_CHECKING: import subprocess from openllm_core._configuration import LLMConfig - logger = logging.getLogger(__name__) P = ParamSpec("P") @@ -19,13 +18,10 @@ LiteralOutput = t.Literal["json", "pretty", "porcelain"] _AnyCallable = t.Callable[..., t.Any] FC = t.TypeVar("FC", bound=t.Union[_AnyCallable, click.Command]) - def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: return [sc.CompletionItem(str(it.tag), help="Bento") for it in bentoml.list() if str(it.tag).startswith(incomplete) and all(k in it.info.labels for k in {"start_name", "bundler"})] - def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: return [sc.CompletionItem(inflection.dasherize(it), help="Model") for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)] - def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny: # TODO: Support amd.com/gpu on k8s _bentoml_config_options_env = environ.pop("BENTOML_CONFIG_OPTIONS", "") @@ -41,9 +37,7 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res environ["BENTOML_CONFIG_OPTIONS"] = _bentoml_config_options_env if DEBUG: logger.debug("Setting BENTOML_CONFIG_OPTIONS=%s", _bentoml_config_options_env) return environ - _adapter_mapping_key = "adapter_map" - def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ...] | None) -> None: if not value: return None if _adapter_mapping_key not in ctx.params: ctx.params[_adapter_mapping_key] = {} @@ -51,28 +45,20 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ... 
adapter_id, *adapter_name = v.rsplit(":", maxsplit=1) # try to resolve the full path if users pass in relative, # currently only support one level of resolve path with current directory - try: adapter_id = openllm.utils.resolve_user_filepath(adapter_id, os.getcwd()) - except FileNotFoundError: pass + try: + adapter_id = openllm.utils.resolve_user_filepath(adapter_id, os.getcwd()) + except FileNotFoundError: + pass ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0] if len(adapter_name) > 0 else None return None - def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command: - """Generate a 'click.Command' for any given LLM. - - Args: - group: the target ``click.Group`` to save this LLM cli under - model: The name of the model or the ``bentoml.Bento`` instance. - - Returns: - The click.Command for starting the model server - - Note that the internal commands will return the llm_config and a boolean determine - whether the server is run with GPU or not. - """ llm_config = openllm.AutoConfig.for_model(model) - command_attrs: DictStrAny = dict( - name=llm_config["model_name"], context_settings=_context_settings or termui.CONTEXT_SETTINGS, short_help=f"Start a LLMServer for '{model}'", aliases=[llm_config["start_name"]] if llm_config["name_type"] == "dasherize" else None, help=f"""\ + name=llm_config["model_name"], + context_settings=_context_settings or termui.CONTEXT_SETTINGS, + short_help=f"Start a LLMServer for '{model}'", + aliases=[llm_config["start_name"]] if llm_config["name_type"] == "dasherize" else None, + help=f"""\ {llm_config['env'].start_docstring} \b @@ -95,16 +81,14 @@ Available official model_id(s): [default: {llm_config['default_id']}] if llm_config["requires_gpu"] and openllm.utils.device_count() < 1: # NOTE: The model requires GPU, therefore we will return a dummy command - command_attrs.update({"short_help": "(Disabled because there is no GPU available)", "help": f"""{model} is currently not available to run on your local machine because it requires GPU for inference."""}) + command_attrs.update({"short_help": "(Disabled because there is no GPU available)", "help": f"{model} is currently not available to run on your local machine because it requires GPU for inference."}) return noop_command(group, llm_config, _serve_grpc, **command_attrs) @group.command(**command_attrs) @start_decorator(llm_config, serve_grpc=_serve_grpc) @click.pass_context - def start_cmd( - ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString, device: t.Tuple[str, ...], quantize: t.Literal["int8", "int4", "gptq"] | None, bettertransformer: bool | None, runtime: t.Literal["ggml", "transformers"], fast: bool, - serialisation_format: t.Literal["safetensors", "legacy"], cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any, - ) -> LLMConfig | subprocess.Popen[bytes]: + def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString, device: t.Tuple[str, ...], quantize: t.Literal["int8", "int4", "gptq"] | None, bettertransformer: bool | None, runtime: t.Literal["ggml", "transformers"], fast: bool, serialisation_format: t.Literal["safetensors", "legacy"], cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any, + ) -> LLMConfig | 
subprocess.Popen[bytes]: fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES: termui.echo(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg="yellow") @@ -176,7 +160,6 @@ Available official model_id(s): [default: {llm_config['default_id']}] return config return start_cmd - def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, **command_attrs: t.Any) -> click.Command: context_settings = command_attrs.pop("context_settings", {}) context_settings.update({"ignore_unknown_options": True, "allow_extra_args": True}) @@ -189,7 +172,6 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, * return llm_config return noop - def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None: if adapter_map and not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'") if quantize and llm_config.default_implementation() == "vllm": ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.") @@ -197,20 +179,21 @@ def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: Lite if requirements is not None and len(requirements) > 0: missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None] if len(missing_requirements) > 0: termui.echo(f"Make sure to have the following dependencies available: {missing_requirements}", fg="yellow") - def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]: def wrapper(fn: FC) -> t.Callable[[FC], FC]: composed = openllm.utils.compose( - llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args, - cog.optgroup.group("General LLM Options", help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."), - model_id_option(factory=cog.optgroup, model_env=llm_config["env"]), - model_version_option(factory=cog.optgroup), - cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds"), - workers_per_resource_option(factory=cog.optgroup), - cors_option(factory=cog.optgroup), - fast_option(factory=cog.optgroup), - cog.optgroup.group( - "LLM Optimization Options", help="""Optimization related options. + llm_config.to_click_options, + _http_server_args if not serve_grpc else _grpc_server_args, + cog.optgroup.group("General LLM Options", help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."), + model_id_option(factory=cog.optgroup, model_env=llm_config["env"]), + model_version_option(factory=cog.optgroup), + cog.optgroup.option("--server-timeout", type=int, default=None, help="Server timeout in seconds"), + workers_per_resource_option(factory=cog.optgroup), + cors_option(factory=cog.optgroup), + fast_option(factory=cog.optgroup), + cog.optgroup.group( + "LLM Optimization Options", + help="""Optimization related options. 
OpenLLM supports running model with [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/), k-bit quantization (8-bit, 4-bit), GPTQ quantization, PagedAttention via vLLM. @@ -220,14 +203,13 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/) - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml) """, - ), - cog.optgroup.option("--device", type=openllm.utils.dantic.CUDA, multiple=True, envvar="CUDA_VISIBLE_DEVICES", callback=parse_device_callback, help=f"Assign GPU devices (if available) for {llm_config['model_name']}.", show_envvar=True), - cog.optgroup.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers."), - quantize_option(factory=cog.optgroup, model_env=llm_config["env"]), - bettertransformer_option(factory=cog.optgroup, model_env=llm_config["env"]), - serialisation_option(factory=cog.optgroup), - cog.optgroup.group( - "Fine-tuning related options", help="""\ + ), + cog.optgroup.option("--device", type=openllm.utils.dantic.CUDA, multiple=True, envvar="CUDA_VISIBLE_DEVICES", callback=parse_device_callback, help=f"Assign GPU devices (if available) for {llm_config['model_name']}.", show_envvar=True), + cog.optgroup.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers."), + quantize_option(factory=cog.optgroup, model_env=llm_config["env"]), + bettertransformer_option(factory=cog.optgroup, model_env=llm_config["env"]), + serialisation_option(factory=cog.optgroup), + cog.optgroup.group("Fine-tuning related options", help="""\ Note that the argument `--adapter-id` can accept the following format: - `--adapter-id /path/to/adapter` (local adapter) @@ -241,14 +223,13 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab $ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora ``` - """, - ), - cog.optgroup.option("--adapter-id", default=None, help="Optional name or path for given LoRA adapter" + f" to wrap '{llm_config['model_name']}'", multiple=True, callback=_id_callback, metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]"), - click.option("--return-process", is_flag=True, default=False, help="Internal use only.", hidden=True), + """), + cog.optgroup.option("--adapter-id", default=None, help="Optional name or path for given LoRA adapter" + f" to wrap '{llm_config['model_name']}'", multiple=True, callback=_id_callback, metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]"), + click.option("--return-process", is_flag=True, default=False, help="Internal use only.", hidden=True), ) return composed(fn) - return wrapper + return wrapper def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None: if value is None: return value if not isinstance(value, tuple): ctx.fail(f"{param} only accept multiple values, not {type(value)} (value: {value})") @@ -256,12 +237,10 @@ def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tup # NOTE: --device all is a special case if len(el) == 1 and el[0] == "all": return tuple(map(str, openllm.utils.available_devices())) return el - # NOTE: A list of bentoml option that is not needed for parsing. 
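As a rough illustration of how the --adapter-id values accepted by the decorator above end up in adapter_map, mirroring the rsplit(":", maxsplit=1) logic of _id_callback (a standalone sketch, not the actual callback):

def split_adapter_spec(value: str) -> tuple[str, str | None]:
    # "remote/adapter:eng_lora" -> ("remote/adapter", "eng_lora"); "/path/to/adapter" -> ("/path/to/adapter", None)
    adapter_id, *adapter_name = value.rsplit(":", maxsplit=1)
    return adapter_id, adapter_name[0] if adapter_name else None

adapter_map = dict(split_adapter_spec(v) for v in ("/path/to/adapter_dir", "remote/adapter:eng_lora"))
# -> {'/path/to/adapter_dir': None, 'remote/adapter': 'eng_lora'}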
# NOTE: User shouldn't set '--working-dir', as OpenLLM will setup this. # NOTE: production is also deprecated _IGNORED_OPTIONS = {"working_dir", "production", "protocol_version"} - def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig]], t.Callable[[FC], FC]]: """Parsing `bentoml serve|serve-grpc` click.Option to be parsed via `openllm start`.""" from bentoml_cli.cli import cli @@ -285,10 +264,9 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig] param_decls = (*attrs.pop("opts"), *attrs.pop("secondary_opts")) f = cog.optgroup.option(*param_decls, **attrs)(f) return group(f) + return decorator - _http_server_args, _grpc_server_args = parse_serve_args(False), parse_serve_args(True) - def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | None], FC]: """General ``@click`` decorator with some sauce. @@ -298,117 +276,147 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | factory = attrs.pop("factory", click) factory_attr = attrs.pop("attr", "option") if factory_attr != "argument": attrs.setdefault("help", "General option for OpenLLM CLI.") + def decorator(f: FC | None) -> FC: callback = getattr(factory, factory_attr, None) if callback is None: raise ValueError(f"Factory {factory} has no attribute {factory_attr}.") return t.cast(FC, callback(*param_decls, **attrs)(f) if f is not None else callback(*param_decls, **attrs)) - return decorator + return decorator cli_option = functools.partial(_click_factory_type, attr="option") cli_argument = functools.partial(_click_factory_type, attr="argument") - def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = "pretty", **attrs: t.Any) -> t.Callable[[FC], FC]: output = ["json", "pretty", "porcelain"] - def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]: return [CompletionItem(it) for it in output] + + def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]: + return [CompletionItem(it) for it in output] + return cli_option("-o", "--output", "output", type=click.Choice(output), default=default_value, help="Showing output type.", show_default=True, envvar="OPENLLM_OUTPUT", show_envvar=True, shell_complete=complete_output_var, **attrs)(f) def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option( - "--fast/--no-fast", show_default=True, default=False, envvar="OPENLLM_USE_LOCAL_LATEST", show_envvar=True, help="""Whether to skip checking if models is already in store. + return cli_option("--fast/--no-fast", show_default=True, default=False, envvar="OPENLLM_USE_LOCAL_LATEST", show_envvar=True, help="""Whether to skip checking if models is already in store. This is useful if you already downloaded or setup the model beforehand. 
- """, **attrs - )(f) -def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--cors/--no-cors", show_default=True, default=False, envvar="OPENLLM_CORS", show_envvar=True, help="Enable CORS for the server.", **attrs)(f) -def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f) -def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-id", type=click.STRING, default=None, envvar=model_env.model_id if model_env is not None else None, show_envvar=model_env is not None, help="Optional model_id name or path for (fine-tune) weight.", **attrs)(f) -def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)(f) -def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f) + """, **attrs)(f) +def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: + return cli_option("--cors/--no-cors", show_default=True, default=False, envvar="OPENLLM_CORS", show_envvar=True, help="Enable CORS for the server.", **attrs)(f) +def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: + return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f) +def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: + return cli_option("--model-id", type=click.STRING, default=None, envvar=model_env.model_id if model_env is not None else None, show_envvar=model_env is not None, help="Optional model_id name or path for (fine-tune) weight.", **attrs)(f) +def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: + return cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)(f) +def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: + return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f) def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( - "--quantise", "--quantize", "quantize", type=click.Choice(["int8", "int4", "gptq"]), default=None, envvar=model_env.quantize if model_env is not None else None, show_envvar=model_env is not None, help="""Dynamic quantization for running this LLM. + "--quantise", + "--quantize", + "quantize", + type=click.Choice(["int8", "int4", "gptq"]), + default=None, + envvar=model_env.quantize if model_env is not None else None, + show_envvar=model_env is not None, + help="""Dynamic quantization for running this LLM. 
- The following quantization strategies are supported: + The following quantization strategies are supported: - - ``int8``: ``LLM.int8`` for [8-bit](https://arxiv.org/abs/2208.07339) quantization. + - ``int8``: ``LLM.int8`` for [8-bit](https://arxiv.org/abs/2208.07339) quantization. - - ``int4``: ``SpQR`` for [4-bit](https://arxiv.org/abs/2306.03078) quantization. + - ``int4``: ``SpQR`` for [4-bit](https://arxiv.org/abs/2306.03078) quantization. - - ``gptq``: ``GPTQ`` [quantization](https://arxiv.org/abs/2210.17323) + - ``gptq``: ``GPTQ`` [quantization](https://arxiv.org/abs/2210.17323) - > [!NOTE] that the model can also be served with quantized weights. - """ + ( - """ - > [!NOTE] that this will set the mode for serving within deployment.""" if build else "" - ) + """ - > [!NOTE] that quantization are currently only available in *PyTorch* models.""", **attrs + > [!NOTE] that the model can also be served with quantized weights. + """ + (""" + > [!NOTE] that this will set the mode for serving within deployment.""" if build else "") + """ + > [!NOTE] that quantization are currently only available in *PyTorch* models.""", + **attrs )(f) def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( - "--workers-per-resource", default=None, callback=workers_per_resource_callback, type=str, required=False, help="""Number of workers per resource assigned. + "--workers-per-resource", + default=None, + callback=workers_per_resource_callback, + type=str, + required=False, + help="""Number of workers per resource assigned. - See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy - for more information. By default, this is set to 1. + See https://docs.bentoml.org/en/latest/guides/scheduling.html#resource-scheduling-strategy + for more information. By default, this is set to 1. - > [!NOTE] ``--workers-per-resource`` will also accept the following strategies: + > [!NOTE] ``--workers-per-resource`` will also accept the following strategies: - - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models. + - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models. - - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``. - """ + ( - """\n - > [!NOTE] The workers value passed into 'build' will determine how the LLM can - > be provisioned in Kubernetes as well as in standalone container. This will - > ensure it has the same effect with 'openllm start --workers ...'""" if build else "" - ), **attrs + - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``. + """ + ("""\n + > [!NOTE] The workers value passed into 'build' will determine how the LLM can + > be provisioned in Kubernetes as well as in standalone container. 
This will + > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ""), + **attrs )(f) def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option( - "--bettertransformer", is_flag=True, default=None, envvar=model_env.bettertransformer if model_env is not None else None, show_envvar=model_env is not None, help="Apply FasterTransformer wrapper to serve model. This will applies during serving time." if not build else "Set default environment variable whether to serve this model with FasterTransformer in build time.", **attrs - )(f) + return cli_option("--bettertransformer", is_flag=True, default=None, envvar=model_env.bettertransformer if model_env is not None else None, show_envvar=model_env is not None, help="Apply FasterTransformer wrapper to serve model. This will apply during serving time." if not build else "Set the default environment variable for whether to serve this model with FasterTransformer at build time.", **attrs)(f) def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( - "--serialisation", "--serialization", "serialisation_format", type=click.Choice(["safetensors", "legacy"]), default="safetensors", show_default=True, show_envvar=True, envvar="OPENLLM_SERIALIZATION", help="""Serialisation format for save/load LLM. + "--serialisation", + "--serialization", + "serialisation_format", + type=click.Choice(["safetensors", "legacy"]), + default="safetensors", + show_default=True, + show_envvar=True, + envvar="OPENLLM_SERIALIZATION", + help="""Serialisation format for saving and loading the LLM. - Currently the following strategies are supported: + Currently the following strategies are supported: - - ``safetensors``: This will use safetensors format, which is synonymous to + - ``safetensors``: This will use safetensors format, which is synonymous to - \b - ``safe_serialization=True``. + \b + ``safe_serialization=True``. - \b - > [!NOTE] that this format might not work for every cases, and - you can always fallback to ``legacy`` if needed. + \b + > [!NOTE] that this format might not work for every case, and + you can always fall back to ``legacy`` if needed. - - ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. - This should be used if the model doesn't yet support safetensors. + - ``legacy``: This will use PyTorch serialisation format, often as ``.bin`` files. This should be used if the model doesn't yet support safetensors. - > [!NOTE] that GGML format is working in progress. - """, **attrs + > [!NOTE] that GGML format is a work in progress. + """, + **attrs )(f) def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( - "--container-registry", "container_registry", type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM. + "--container-registry", + "container_registry", + type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), + default="ecr", + show_default=True, + show_envvar=True, + envvar="OPENLLM_CONTAINER_REGISTRY", + callback=container_registry_callback, + help="""The default container registry to get the base image for building BentoLLM.
- Currently, it supports 'ecr', 'ghcr.io', 'docker.io' + Currently, it supports 'ecr', 'ghcr.io', 'docker.io' - \b - > [!NOTE] that in order to build the base image, you will need a GPUs to compile custom kernel. See ``openllm ext build-base-container`` for more information. - """, **attrs + \b + > [!NOTE] that in order to build the base image, you will need a GPU to compile custom kernels. See ``openllm ext build-base-container`` for more information. + """, + **attrs )(f) - _wpr_strategies = {"round_robin", "conserved"} - def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None: if value is None: return value value = inflection.underscore(value) if value in _wpr_strategies: return value else: - try: float(value) # type: ignore[arg-type] - except ValueError: raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None + try: + float(value) # type: ignore[arg-type] + except ValueError: + raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, param) from None else: return value - def container_registry_callback(ctx: click.Context, param: click.Parameter, value: str | None) -> str | None: if value is None: return value if value not in openllm.bundle.supported_registries: raise click.BadParameter(f"Value must be one of {openllm.bundle.supported_registries}", ctx, param) diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index 512dc49e..5f38ca18 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -10,11 +10,26 @@ if t.TYPE_CHECKING: from openllm_core._configuration import LLMConfig from openllm_core._typing_compat import LiteralString, LiteralRuntime, LiteralContainerRegistry, LiteralContainerVersionStrategy from bentoml._internal.bento import BentoStore - logger = logging.getLogger(__name__) - -def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30, workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None, device: tuple[str, ...] | t.Literal["all"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", - adapter_map: dict[LiteralString, str | None] | None = None, framework: LiteralRuntime | None = None, additional_args: list[str] | None = None, cors: bool = False, _serve_grpc: bool = False, __test__: bool = False, **_: t.Any) -> LLMConfig | subprocess.Popen[bytes]: +def _start( + model_name: str, + /, + *, + model_id: str | None = None, + timeout: int = 30, + workers_per_resource: t.Literal["conserved", "round_robin"] | float | None = None, + device: tuple[str, ...] | t.Literal["all"] | None = None, + quantize: t.Literal["int8", "int4", "gptq"] | None = None, + bettertransformer: bool | None = None, + runtime: t.Literal["ggml", "transformers"] = "transformers", + adapter_map: dict[LiteralString, str | None] | None = None, + framework: LiteralRuntime | None = None, + additional_args: list[str] | None = None, + cors: bool = False, + _serve_grpc: bool = False, + __test__: bool = False, + **_: t.Any +) -> LLMConfig | subprocess.Popen[bytes]: """Python API to start a LLM server. This provides a one-to-one mapping to CLI arguments. For all additional arguments, pass it as string to ``additional_args``.
For example, if you want to @@ -73,9 +88,31 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30 if __test__: args.append("--return-process") return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False) - @inject -def _build(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, bento_version: str | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: bool | None = None, adapter_map: dict[str, str | None] | None = None, build_ctx: str | None = None, enable_features: tuple[str, ...] | None = None, workers_per_resource: float | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", dockerfile_template: str | None = None, overwrite: bool = False, container_registry: LiteralContainerRegistry | None = None, container_version_strategy: LiteralContainerVersionStrategy | None = None, push: bool = False, containerize: bool = False, serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", additional_args: list[str] | None = None, bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento: +def _build( + model_name: str, + /, + *, + model_id: str | None = None, + model_version: str | None = None, + bento_version: str | None = None, + quantize: t.Literal["int8", "int4", "gptq"] | None = None, + bettertransformer: bool | None = None, + adapter_map: dict[str, str | None] | None = None, + build_ctx: str | None = None, + enable_features: tuple[str, ...] | None = None, + workers_per_resource: float | None = None, + runtime: t.Literal["ggml", "transformers"] = "transformers", + dockerfile_template: str | None = None, + overwrite: bool = False, + container_registry: LiteralContainerRegistry | None = None, + container_version_strategy: LiteralContainerVersionStrategy | None = None, + push: bool = False, + containerize: bool = False, + serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", + additional_args: list[str] | None = None, + bento_store: BentoStore = Provide[BentoMLContainer.bento_store] +) -> bentoml.Bento: """Package a LLM into a Bento. The LLM will be built into a BentoService with the following structure: @@ -155,7 +192,6 @@ def _build(model_name: str, /, *, model_id: str | None = None, model_version: st matched = re.match(r"__tag__:([^:\n]+:[^:\n]+)$", output.decode("utf-8").strip()) if matched is None: raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.") return bentoml.get(matched.group(1), _bento_store=bento_store) - def _import_model(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", implementation: LiteralRuntime = "pt", quantize: t.Literal["int8", "int4", "gptq"] | None = None, serialisation_format: t.Literal["legacy", "safetensors"] = "safetensors", additional_args: t.Sequence[str] | None = None) -> bentoml.Model: """Import a LLM into local store. 
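The keyword-only `_start` and `_build` wrappers reformatted above are later wired into the public Python SDK via `gen_sdk`. A short usage sketch, assuming they surface as `openllm.start` and `openllm.build` and using a placeholder model id:

```python
import openllm

# Package a model into a Bento; returns a bentoml.Bento handle whose tag can be pushed or containerized.
bento = openllm.build("opt", model_id="facebook/opt-1.3b", quantize="int8", serialisation_format="safetensors", overwrite=True)
print(bento.tag)

# Or serve it directly; extra CLI flags are forwarded as strings via `additional_args`.
openllm.start("opt", model_id="facebook/opt-1.3b", quantize="int8", additional_args=["--port", "3000"])
```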
@@ -194,12 +230,9 @@ def _import_model(model_name: str, /, *, model_id: str | None = None, model_vers if additional_args is not None: args.extend(additional_args) if quantize is not None: args.extend(["--quantize", quantize]) return import_command.main(args=args, standalone_mode=False) - def _list_models() -> dict[str, t.Any]: """List all available models within the local store.""" from .entrypoint import models_command return models_command.main(args=["-o", "json", "--show-available", "--machine"], standalone_mode=False) - - start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models) __all__ = ["start", "start_grpc", "build", "import_model", "list_models"] diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py index 70dcc11b..ff8adedd 100644 --- a/openllm-python/src/openllm/cli/entrypoint.py +++ b/openllm-python/src/openllm/cli/entrypoint.py @@ -26,57 +26,12 @@ from bentoml_cli.utils import BentoMLCommandGroup, opt_callback from bentoml._internal.configuration.containers import BentoMLContainer from bentoml._internal.models.model import ModelStore from . import termui -from ._factory import ( - FC, - LiteralOutput, - _AnyCallable, - bettertransformer_option, - container_registry_option, - fast_option, - machine_option, - model_id_option, - model_name_argument, - model_version_option, - output_option, - parse_device_callback, - quantize_option, - serialisation_option, - start_command_factory, - workers_per_resource_option, -) +from ._factory import FC, LiteralOutput, _AnyCallable, bettertransformer_option, container_registry_option, fast_option, machine_option, model_id_option, model_name_argument, model_version_option, output_option, parse_device_callback, quantize_option, serialisation_option, start_command_factory, workers_per_resource_option from openllm import bundle, serialisation from openllm.exceptions import OpenLLMException -from openllm.models.auto import ( - CONFIG_MAPPING, - MODEL_FLAX_MAPPING_NAMES, - MODEL_MAPPING_NAMES, - MODEL_TF_MAPPING_NAMES, - MODEL_VLLM_MAPPING_NAMES, - AutoConfig, - AutoLLM, -) +from openllm.models.auto import CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES, AutoConfig, AutoLLM from openllm_core._typing_compat import DictStrAny, ParamSpec, Concatenate, LiteralString, Self, LiteralRuntime -from openllm_core.utils import ( - DEBUG, - DEBUG_ENV_VAR, - OPTIONAL_DEPENDENCIES, - QUIET_ENV_VAR, - EnvVarMixin, - LazyLoader, - analytics, - bentoml_cattr, - compose, - configure_logging, - dantic, - first_not_none, - get_debug_mode, - get_quiet_mode, - is_torch_available, - is_transformers_supports_agent, - resolve_user_filepath, - set_debug_mode, - set_quiet_mode, -) +from openllm_core.utils import DEBUG, DEBUG_ENV_VAR, OPTIONAL_DEPENDENCIES, QUIET_ENV_VAR, EnvVarMixin, LazyLoader, analytics, bentoml_cattr, compose, configure_logging, dantic, first_not_none, get_debug_mode, get_quiet_mode, is_torch_available, is_transformers_supports_agent, resolve_user_filepath, set_debug_mode, set_quiet_mode from openllm.utils import infer_auto_class if t.TYPE_CHECKING: @@ -85,7 +40,8 @@ if t.TYPE_CHECKING: from bentoml._internal.container import DefaultBuilder from openllm_core._schema import 
EmbeddingsOutput from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy -else: torch = LazyLoader("torch", globals(), "torch") +else: + torch = LazyLoader("torch", globals(), "torch") P = ParamSpec("P") logger = logging.getLogger(__name__) @@ -99,25 +55,27 @@ OPENLLM_FIGLET = """\ """ ServeCommand = t.Literal["serve", "serve-grpc"] - @attr.define class GlobalOptions: cloud_context: str | None = attr.field(default=None) - def with_options(self, **attrs: t.Any) -> Self: return attr.evolve(self, **attrs) + def with_options(self, **attrs: t.Any) -> Self: + return attr.evolve(self, **attrs) GrpType = t.TypeVar("GrpType", bound=click.Group) _object_setattr = object.__setattr__ _EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), "extension")) - class Extensions(click.MultiCommand): - def list_commands(self, ctx: click.Context) -> list[str]: return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith(".py") and not filename.startswith("__")]) - def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None: - try: mod = __import__(f"openllm.cli.extension.{cmd_name}", None, None, ["cli"]) - except ImportError: return None - return mod.cli + def list_commands(self, ctx: click.Context) -> list[str]: + return sorted([filename[:-3] for filename in os.listdir(_EXT_FOLDER) if filename.endswith(".py") and not filename.startswith("__")]) + def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None: + try: + mod = __import__(f"openllm.cli.extension.{cmd_name}", None, None, ["cli"]) + except ImportError: + return None + return mod.cli class OpenLLMCommandGroup(BentoMLCommandGroup): NUMBER_OF_COMMON_PARAMS = 5 # parameters in common_params + 1 faked group option header @@ -139,6 +97,7 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): elif debug: set_debug_mode(True) configure_logging() return f(*args, **attrs) + return wrapper @staticmethod @@ -148,7 +107,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): @functools.wraps(func) def wrapper(do_not_track: bool, *args: P.args, **attrs: P.kwargs) -> t.Any: if do_not_track: - with analytics.set_bentoml_tracking(): return func(*args, **attrs) + with analytics.set_bentoml_tracking(): + return func(*args, **attrs) start_time = time.time_ns() with analytics.set_bentoml_tracking(): if group.name is None: raise ValueError("group.name should not be None") @@ -166,16 +126,22 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): event.return_code = 2 if isinstance(e, KeyboardInterrupt) else 1 analytics.track(event) raise + return t.cast(t.Callable[Concatenate[bool, P], t.Any], wrapper) @staticmethod def exception_handling(func: t.Callable[P, t.Any], group: click.Group, **attrs: t.Any) -> t.Callable[P, t.Any]: command_name = attrs.get("name", func.__name__) + @functools.wraps(func) def wrapper(*args: P.args, **attrs: P.kwargs) -> t.Any: - try: return func(*args, **attrs) - except OpenLLMException as err: raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg="red")) from err - except KeyboardInterrupt: pass + try: + return func(*args, **attrs) + except OpenLLMException as err: + raise click.ClickException(click.style(f"[{group.name}] '{command_name}' failed: " + err.message, fg="red")) from err + except KeyboardInterrupt: + pass + return wrapper def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None: @@ -183,13 +149,15 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): 
return t.cast("Extensions", extension_command).get_command(ctx, cmd_name) cmd_name = self.resolve_alias(cmd_name) if ctx.command.name in _start_mapping: - try: return _start_mapping[ctx.command.name][cmd_name] + try: + return _start_mapping[ctx.command.name][cmd_name] except KeyError: # TODO: support start from a bento try: bentoml.get(cmd_name) raise click.ClickException(f"'openllm start {cmd_name}' is currently disabled for the time being. Please let us know if you need this feature by opening an issue on GitHub.") - except bentoml.exceptions.NotFound: pass + except bentoml.exceptions.NotFound: + pass raise click.BadArgumentUsage(f"{cmd_name} is not a valid model identifier supported by OpenLLM.") from None return super().get_command(ctx, cmd_name) @@ -240,12 +208,13 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): # allow for 3 times the default spacing if len(commands): limit = formatter.width - 6 - max(len(cmd[0]) for cmd in commands) - rows: list[tuple[str, str]]= [] + rows: list[tuple[str, str]] = [] for subcommand, cmd in commands: help = cmd.get_short_help_str(limit) rows.append((subcommand, help)) if rows: - with formatter.section(_("Commands")): formatter.write_dl(rows) + with formatter.section(_("Commands")): + formatter.write_dl(rows) if len(extensions): limit = formatter.width - 6 - max(len(cmd[0]) for cmd in extensions) rows = [] @@ -253,8 +222,8 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): help = cmd.get_short_help_str(limit) rows.append((inflection.dasherize(subcommand), help)) if rows: - with formatter.section(_("Extensions")): formatter.write_dl(rows) - + with formatter.section(_("Extensions")): + formatter.write_dl(rows) @click.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="openllm") @click.version_option(None, "--version", "-v", message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}") def cli() -> None: @@ -270,7 +239,6 @@ def cli() -> None: An open platform for operating large language models in production. Fine-tune, serve, deploy, and monitor any LLMs with ease. """ - @cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="start", aliases=["start-http"]) def start_command() -> None: """Start any LLM as a REST server. @@ -280,7 +248,6 @@ def start_command() -> None: $ openllm -- ... ``` """ - @cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="start-grpc") def start_grpc_command() -> None: """Start any LLM as a gRPC server. @@ -290,9 +257,7 @@ def start_grpc_command() -> None: $ openllm start-grpc -- ... 
``` """ - _start_mapping = {"start": {key: start_command_factory(start_command, key, _context_settings=termui.CONTEXT_SETTINGS) for key in CONFIG_MAPPING}, "start-grpc": {key: start_command_factory(start_grpc_command, key, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=True) for key in CONFIG_MAPPING}} - @cli.command(name="import", aliases=["download"]) @model_name_argument @click.argument("model_id", type=click.STRING, default=None, metavar="Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]", required=False) @@ -378,7 +343,6 @@ def import_command(model_name: str, model_id: str | None, converter: str | None, elif output == "json": termui.echo(orjson.dumps({"previously_setup": _previously_saved, "framework": impl, "tag": str(_ref.tag)}, option=orjson.OPT_INDENT_2).decode()) else: termui.echo(_ref.tag) return _ref - @cli.command(context_settings={"token_normalize_func": inflection.underscore}) @model_name_argument @model_id_option @@ -407,8 +371,32 @@ def import_command(model_name: str, model_id: str | None, converter: str | None, @click.option("--force-push", default=False, is_flag=True, type=click.BOOL, help="Whether to force push.") @click.pass_context def build_command( - ctx: click.Context, /, model_name: str, model_id: str | None, bento_version: str | None, overwrite: bool, output: LiteralOutput, runtime: t.Literal["ggml", "transformers"], quantize: t.Literal["int8", "int4", "gptq"] | None, enable_features: tuple[str, ...] | None, bettertransformer: bool | None, workers_per_resource: float | None, adapter_id: tuple[str, ...], - build_ctx: str | None, machine: bool, device: tuple[str, ...], model_version: str | None, dockerfile_template: t.TextIO | None, containerize: bool, push: bool, serialisation_format: t.Literal["safetensors", "legacy"], fast: bool, container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy, force_push: bool, **attrs: t.Any, + ctx: click.Context, + /, + model_name: str, + model_id: str | None, + bento_version: str | None, + overwrite: bool, + output: LiteralOutput, + runtime: t.Literal["ggml", "transformers"], + quantize: t.Literal["int8", "int4", "gptq"] | None, + enable_features: tuple[str, ...] | None, + bettertransformer: bool | None, + workers_per_resource: float | None, + adapter_id: tuple[str, ...], + build_ctx: str | None, + machine: bool, + device: tuple[str, ...], + model_version: str | None, + dockerfile_template: t.TextIO | None, + containerize: bool, + push: bool, + serialisation_format: t.Literal["safetensors", "legacy"], + fast: bool, + container_registry: LiteralContainerRegistry, + container_version_strategy: LiteralContainerVersionStrategy, + force_push: bool, + **attrs: t.Any, ) -> bentoml.Bento: """Package a given models into a Bento. 
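`build_command` exposes a `--machine` flag (see the signature above) whose output is the `__tag__:<tag>` line that `_build` parses with the regex shown earlier in this diff. A self-contained sketch of that round trip; the sample output string is made up:

```python
import re

# Mirrors the `__tag__` parsing used by `_build` to recover the Bento tag from machine output.
def parse_machine_output(output: str) -> str:
  matched = re.match(r"__tag__:([^:\n]+:[^:\n]+)$", output.strip())
  if matched is None:
    raise ValueError(f"Failed to find tag from output: {output.strip()}")
  return matched.group(1)

assert parse_machine_output("__tag__:opt-service:abc123\n") == "opt-service:abc123"
```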
@@ -488,12 +476,9 @@ def build_command( raise bentoml.exceptions.NotFound(f"Rebuilding existing Bento {bento_tag}") from None _previously_built = True except bentoml.exceptions.NotFound: - bento = bundle.create_bento( - bento_tag, llm_fs, llm, workers_per_resource=workers_per_resource, adapter_map=adapter_map, - quantize=quantize, bettertransformer=bettertransformer, extra_dependencies=enable_features, dockerfile_template=dockerfile_template_path, runtime=runtime, - container_registry=container_registry, container_version_strategy=container_version_strategy - ) - except Exception as err: raise err from None + bento = bundle.create_bento(bento_tag, llm_fs, llm, workers_per_resource=workers_per_resource, adapter_map=adapter_map, quantize=quantize, bettertransformer=bettertransformer, extra_dependencies=enable_features, dockerfile_template=dockerfile_template_path, runtime=runtime, container_registry=container_registry, container_version_strategy=container_version_strategy) + except Exception as err: + raise err from None if machine: termui.echo(f"__tag__:{bento.tag}", fg="white") elif output == "pretty": @@ -502,18 +487,23 @@ def build_command( if not _previously_built: termui.echo(f"Successfully built {bento}.", fg="green") elif not overwrite: termui.echo(f"'{model_name}' already has a Bento built [{bento}]. To overwrite it pass '--overwrite'.", fg="yellow") termui.echo("📖 Next steps:\n\n" + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" + f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" + "\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n", fg="blue",) - elif output == "json": termui.echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode()) - else: termui.echo(bento.tag) + elif output == "json": + termui.echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode()) + else: + termui.echo(bento.tag) if push: BentoMLContainer.bentocloud_client.get().push_bento(bento, context=t.cast(GlobalOptions, ctx.obj).cloud_context, force=force_push) elif containerize: backend = t.cast("DefaultBuilder", os.environ.get("BENTOML_CONTAINERIZE_BACKEND", "docker")) - try: bentoml.container.health(backend) - except subprocess.CalledProcessError: raise OpenLLMException(f"Failed to use backend {backend}") from None - try: bentoml.container.build(bento.tag, backend=backend, features=("grpc", "io")) - except Exception as err: raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err + try: + bentoml.container.health(backend) + except subprocess.CalledProcessError: + raise OpenLLMException(f"Failed to use backend {backend}") from None + try: + bentoml.container.build(bento.tag, backend=backend, features=("grpc", "io")) + except Exception as err: + raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err return bento - @cli.command() @output_option @click.option("--show-available", is_flag=True, default=False, help="Show available models in local store (mutually exclusive with '-o porcelain').") @@ -601,7 +591,6 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo if show_available: json_data["local"] = local_models termui.echo(orjson.dumps(json_data, option=orjson.OPT_INDENT_2,).decode(), fg="white") ctx.exit(0) - @cli.command() 
@model_name_argument(required=False) @click.option("-y", "--yes", "--assume-yes", is_flag=True, help="Skip confirmation when deleting a specific model") @@ -625,7 +614,6 @@ def prune_command(model_name: str | None, yes: bool, include_bentos: bool, model if delete_confirmed: store.delete(store_item.tag) termui.echo(f"{store_item} deleted from {'model' if isinstance(store, ModelStore) else 'bento'} store.", fg="yellow") - def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, value: list[str] | str | None) -> tuple[str, bool | str] | list[str] | str | None: if value is None: return value @@ -644,11 +632,9 @@ def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, val return key, values[0] else: raise click.BadParameter(f"Invalid option format: {value}") - def shared_client_options(f: _AnyCallable | None = None, output_value: t.Literal["json", "porcelain", "pretty"] = "pretty") -> t.Callable[[FC], FC]: options = [click.option("--endpoint", type=click.STRING, help="OpenLLM Server endpoint, i.e: http://localhost:3000", envvar="OPENLLM_ENDPOINT", default="http://localhost:3000",), click.option("--timeout", type=click.INT, default=30, help="Default server timeout", show_default=True), output_option(default_value=output_value),] return compose(*options)(f) if f is not None else compose(*options) - @cli.command() @click.argument("task", type=click.STRING, metavar="TASK") @shared_client_options @@ -668,8 +654,10 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: """ client = openllm.client.HTTPClient(endpoint, timeout=timeout) - try: client.call("metadata") - except http.client.BadStatusLine: raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None + try: + client.call("metadata") + except http.client.BadStatusLine: + raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None if agent == "hf": if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. 
Upgrade with 'pip install -U transformers'") _memoized = {k: v[0] for k, v in _memoized.items() if v} @@ -681,7 +669,6 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: return result else: raise click.BadOptionUsage("agent", f"Unknown agent type {agent}") - @cli.command() @shared_client_options(output_value="json") @click.option("--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True) @@ -712,7 +699,6 @@ def embed_command(ctx: click.Context, text: tuple[str, ...], endpoint: str, time else: termui.echo(gen_embed.embeddings, fg="white") ctx.exit(0) - @cli.command() @shared_client_options @click.option("--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True) @@ -744,9 +730,7 @@ def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: in else: termui.echo(res["responses"], fg="white") ctx.exit(0) - @cli.group(cls=Extensions, hidden=True, name="extension") def extension_command() -> None: """Extension for OpenLLM CLI.""" - if __name__ == "__main__": cli() diff --git a/openllm-python/src/openllm/cli/extension/build_base_container.py b/openllm-python/src/openllm/cli/extension/build_base_container.py index 587d8d38..e53b9c59 100644 --- a/openllm-python/src/openllm/cli/extension/build_base_container.py +++ b/openllm-python/src/openllm/cli/extension/build_base_container.py @@ -4,7 +4,9 @@ from openllm.cli import termui from openllm.cli._factory import machine_option, container_registry_option if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy @click.command( - "build_base_container", context_settings=termui.CONTEXT_SETTINGS, help="""Base image builder for BentoLLM. + "build_base_container", + context_settings=termui.CONTEXT_SETTINGS, + help="""Base image builder for BentoLLM. By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04. Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``. 
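The `instruct`, `embed`, and `query` commands above all talk to a running server through `openllm.client.HTTPClient`. A rough programmatic equivalent, with a placeholder endpoint and assuming a server started via `openllm start` is already listening:

```python
import openllm

client = openllm.client.HTTPClient("http://localhost:3000", timeout=30)
client.call("metadata")  # same reachability probe the CLI performs before issuing requests
print(client.embed("What is the difference between gather and scatter?").embeddings)
print(client.query("What is the meaning of life?"))  # processed text, as with the CLI's default output
```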
diff --git a/openllm-python/src/openllm/cli/extension/dive_bentos.py b/openllm-python/src/openllm/cli/extension/dive_bentos.py index 94ec8914..0d917e95 100644 --- a/openllm-python/src/openllm/cli/extension/dive_bentos.py +++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py @@ -7,7 +7,6 @@ from openllm.cli import termui from openllm.cli._factory import bento_complete_envvar, machine_option if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore - @click.command("dive_bentos", context_settings=termui.CONTEXT_SETTINGS) @click.argument("bento", type=str, shell_complete=bento_complete_envvar) @machine_option diff --git a/openllm-python/src/openllm/cli/extension/get_containerfile.py b/openllm-python/src/openllm/cli/extension/get_containerfile.py index 8facabaf..5f6e3f15 100644 --- a/openllm-python/src/openllm/cli/extension/get_containerfile.py +++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py @@ -10,7 +10,6 @@ from openllm.cli._factory import bento_complete_envvar from openllm_core.utils import bentoml_cattr if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore - @click.command("get_containerfile", context_settings=termui.CONTEXT_SETTINGS, help="Return Containerfile of any given Bento.") @click.argument("bento", type=str, shell_complete=bento_complete_envvar) @click.pass_context diff --git a/openllm-python/src/openllm/cli/extension/get_prompt.py b/openllm-python/src/openllm/cli/extension/get_prompt.py index bd848e7b..4f372f07 100644 --- a/openllm-python/src/openllm/cli/extension/get_prompt.py +++ b/openllm-python/src/openllm/cli/extension/get_prompt.py @@ -4,9 +4,7 @@ from bentoml_cli.utils import opt_callback from openllm.cli import termui from openllm.cli._factory import model_complete_envvar, output_option, machine_option from openllm_core._prompt import process_prompt - LiteralOutput = t.Literal["json", "pretty", "porcelain"] - @click.command("get_prompt", context_settings=termui.CONTEXT_SETTINGS) @click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar) @click.argument("prompt", type=click.STRING) diff --git a/openllm-python/src/openllm/cli/extension/list_bentos.py b/openllm-python/src/openllm/cli/extension/list_bentos.py index e8130090..8ab9f518 100644 --- a/openllm-python/src/openllm/cli/extension/list_bentos.py +++ b/openllm-python/src/openllm/cli/extension/list_bentos.py @@ -3,16 +3,12 @@ import click, inflection, orjson, bentoml, openllm from bentoml._internal.utils import human_readable_size from openllm.cli import termui from openllm.cli._factory import LiteralOutput, output_option - @click.command("list_bentos", context_settings=termui.CONTEXT_SETTINGS) @output_option(default_value="json") @click.pass_context def cli(ctx: click.Context, output: LiteralOutput) -> None: """List available bentos built by OpenLLM.""" - mapping = { - k: [{"tag": str(b.tag), "size": human_readable_size(openllm.utils.calc_dir_size(b.path)), "models": [{"tag": str(m.tag), "size": human_readable_size(openllm.utils.calc_dir_size(m.path))} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]} - for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {"start_name", "bundler"})) if b.info.labels["start_name"] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) - } + mapping = {k: [{"tag": str(b.tag), "size": human_readable_size(openllm.utils.calc_dir_size(b.path)), "models": [{"tag": str(m.tag), 
"size": human_readable_size(openllm.utils.calc_dir_size(m.path))} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]} for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {"start_name", "bundler"})) if b.info.labels["start_name"] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())} mapping = {k: v for k, v in mapping.items() if v} if output == "pretty": import tabulate diff --git a/openllm-python/src/openllm/cli/extension/list_models.py b/openllm-python/src/openllm/cli/extension/list_models.py index 5f61609b..7aa1e5ff 100644 --- a/openllm-python/src/openllm/cli/extension/list_models.py +++ b/openllm-python/src/openllm/cli/extension/list_models.py @@ -1,11 +1,10 @@ from __future__ import annotations -import typing as t, bentoml, openllm, orjson, inflection ,click +import typing as t, bentoml, openllm, orjson, inflection, click from openllm.cli import termui from bentoml._internal.utils import human_readable_size from openllm.cli._factory import LiteralOutput, model_name_argument, output_option, model_complete_envvar if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny - @click.command("list_models", context_settings=termui.CONTEXT_SETTINGS) @model_name_argument(required=False, shell_complete=model_complete_envvar) @output_option(default_value="json") diff --git a/openllm-python/src/openllm/cli/extension/playground.py b/openllm-python/src/openllm/cli/extension/playground.py index 42343ccd..2ca824b1 100644 --- a/openllm-python/src/openllm/cli/extension/playground.py +++ b/openllm-python/src/openllm/cli/extension/playground.py @@ -7,15 +7,12 @@ from openllm_core.utils import is_jupyter_available, is_jupytext_available, is_n if t.TYPE_CHECKING: import jupytext, nbformat from openllm_core._typing_compat import DictStrAny - logger = logging.getLogger(__name__) - def load_notebook_metadata() -> DictStrAny: with open(os.path.join(os.path.dirname(playground.__file__), "_meta.yml"), "r") as f: content = yaml.safe_load(f) if not all("description" in k for k in content.values()): raise ValueError("Invalid metadata file. 
All entries must have a 'description' key.") return content - @click.command("playground", context_settings=termui.CONTEXT_SETTINGS) @click.argument("output-dir", default=None, required=False) @click.option("--port", envvar="JUPYTER_PORT", show_envvar=True, show_default=True, default=8888, help="Default port for Jupyter server") diff --git a/openllm-python/src/openllm/cli/termui.py b/openllm-python/src/openllm/cli/termui.py index c6ec9a59..8cb77442 100644 --- a/openllm-python/src/openllm/cli/termui.py +++ b/openllm-python/src/openllm/cli/termui.py @@ -1,11 +1,9 @@ from __future__ import annotations import os, typing as t, click, inflection, openllm if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny - def echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.Any) -> None: attrs["fg"] = fg if not openllm.utils.get_debug_mode() else None if not openllm.utils.get_quiet_mode(): t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs) - COLUMNS: int = int(os.environ.get("COLUMNS", str(120))) CONTEXT_SETTINGS: DictStrAny = {"help_option_names": ["-h", "--help"], "max_content_width": COLUMNS, "token_normalize_func": inflection.underscore} __all__ = ["echo", "COLUMNS", "CONTEXT_SETTINGS"] diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py index 4b94ad61..66ebdd54 100644 --- a/openllm-python/src/openllm/client.py +++ b/openllm-python/src/openllm/client.py @@ -13,5 +13,7 @@ client.embed("What is the difference between gather and scatter?") from __future__ import annotations import openllm_client, typing as t if t.TYPE_CHECKING: from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient -def __dir__() -> t.Sequence[str]: return sorted(dir(openllm_client)) -def __getattr__(it: str) -> t.Any: return getattr(openllm_client, it) +def __dir__() -> t.Sequence[str]: + return sorted(dir(openllm_client)) +def __getattr__(it: str) -> t.Any: + return getattr(openllm_client, it) diff --git a/openllm-python/src/openllm/models/auto/__init__.py b/openllm-python/src/openllm/models/auto/__init__.py index 0552bcec..34d2b858 100644 --- a/openllm-python/src/openllm/models/auto/__init__.py +++ b/openllm-python/src/openllm/models/auto/__init__.py @@ -3,7 +3,6 @@ import typing as t, os import openllm from openllm_core.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES - _import_structure: dict[str, list[str]] = {"modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]} if t.TYPE_CHECKING: from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES @@ -12,30 +11,34 @@ if t.TYPE_CHECKING: from .modeling_vllm_auto import MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES try: if not is_torch_available(): raise openllm.exceptions.MissingDependencyError -except openllm.exceptions.MissingDependencyError: pass +except openllm.exceptions.MissingDependencyError: + pass else: _import_structure["modeling_auto"].extend(["AutoLLM", "MODEL_MAPPING"]) if t.TYPE_CHECKING: from .modeling_auto import MODEL_MAPPING as MODEL_MAPPING, AutoLLM as AutoLLM 
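The `models.auto` package above (and every model package that follows) repeats the same optional-dependency pattern: probe a backend, and only register the concrete classes in `_import_structure` for a `LazyModule` to resolve later. A condensed, illustrative sketch; the helper and module names here are stand-ins, not OpenLLM's real ones:

```python
from __future__ import annotations
import importlib.util

_import_structure: dict[str, list[str]] = {"modeling_auto": ["MODEL_MAPPING_NAMES"]}

def _is_available(pkg: str) -> bool:
  # stand-in for helpers such as is_torch_available() / is_vllm_available()
  return importlib.util.find_spec(pkg) is not None

if _is_available("torch"):
  _import_structure["modeling_auto"].extend(["AutoLLM", "MODEL_MAPPING"])
if _is_available("vllm"):
  _import_structure["modeling_vllm_auto"] = ["AutoVLLM", "MODEL_VLLM_MAPPING"]
# A LazyModule-style loader then resolves these names on first attribute access.
```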
try: if not is_vllm_available(): raise openllm.exceptions.MissingDependencyError -except openllm.exceptions.MissingDependencyError: pass +except openllm.exceptions.MissingDependencyError: + pass else: _import_structure["modeling_vllm_auto"].extend(["AutoVLLM", "MODEL_VLLM_MAPPING"]) if t.TYPE_CHECKING: from .modeling_vllm_auto import MODEL_VLLM_MAPPING as MODEL_VLLM_MAPPING, AutoVLLM as AutoVLLM try: if not is_flax_available(): raise openllm.exceptions.MissingDependencyError -except openllm.exceptions.MissingDependencyError: pass +except openllm.exceptions.MissingDependencyError: + pass else: _import_structure["modeling_flax_auto"].extend(["AutoFlaxLLM", "MODEL_FLAX_MAPPING"]) if t.TYPE_CHECKING: from .modeling_flax_auto import MODEL_FLAX_MAPPING as MODEL_FLAX_MAPPING, AutoFlaxLLM as AutoFlaxLLM try: if not is_tf_available(): raise openllm.exceptions.MissingDependencyError -except openllm.exceptions.MissingDependencyError: pass +except openllm.exceptions.MissingDependencyError: + pass else: _import_structure["modeling_tf_auto"].extend(["AutoTFLLM", "MODEL_TF_MAPPING"]) if t.TYPE_CHECKING: from .modeling_tf_auto import MODEL_TF_MAPPING as MODEL_TF_MAPPING, AutoTFLLM as AutoTFLLM -__lazy=LazyModule(__name__, os.path.abspath("__file__"), _import_structure) -__all__=__lazy.__all__ -__dir__=__lazy.__dir__ -__getattr__=__lazy.__getattr__ +__lazy = LazyModule(__name__, os.path.abspath("__file__"), _import_structure) +__all__ = __lazy.__all__ +__dir__ = __lazy.__dir__ +__getattr__ = __lazy.__getattr__ diff --git a/openllm-python/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py index 48274184..f6a64540 100644 --- a/openllm-python/src/openllm/models/auto/factory.py +++ b/openllm-python/src/openllm/models/auto/factory.py @@ -16,10 +16,12 @@ if t.TYPE_CHECKING: ConfigModelItemsView = _odict_items[type[openllm.LLMConfig], type[openllm.LLM[t.Any, t.Any]]] logger = logging.getLogger(__name__) - class BaseAutoLLMClass: _model_mapping: t.ClassVar[_LazyAutoMapping] - def __init__(self, *args: t.Any, **attrs: t.Any): raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.") + + def __init__(self, *args: t.Any, **attrs: t.Any): + raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.") + @classmethod def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False, **attrs: t.Any) -> openllm.LLM[t.Any, t.Any]: """The lower level API for creating a LLM instance. @@ -32,6 +34,7 @@ class BaseAutoLLMClass: llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs) if ensure_available: llm.ensure_model_id_exists() return llm + @classmethod def create_runner(cls, model: str, model_id: str | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: """Create a LLM Runner for the given model name. 
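A minimal usage sketch for the two factory entry points documented above, assuming `AutoLLM` is re-exported at the package root (as `openllm.AutoConfig` is used elsewhere in this diff) and using a placeholder model id:

```python
import openllm

# Build an LLM instance directly; `ensure_available=True` pulls the weights into the local store.
llm = openllm.AutoLLM.for_model("opt", model_id="facebook/opt-125m", ensure_available=True)

# Or create a BentoML runner; kwargs accepted by `LLM.to_runner` are split out automatically.
runner = openllm.AutoLLM.create_runner("opt", model_id="facebook/opt-125m")
```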
@@ -46,8 +49,10 @@ class BaseAutoLLMClass: """ runner_kwargs_name = set(inspect.signature(openllm.LLM[t.Any, t.Any].to_runner).parameters) runner_attrs = {k: v for k, v in attrs.items() if k in runner_kwargs_name} - for k in runner_attrs: del attrs[k] + for k in runner_attrs: + del attrs[k] return cls.for_model(model, model_id=model_id, **attrs).to_runner(**runner_attrs) + @classmethod def register(cls, config_class: type[openllm.LLMConfig], llm_class: type[openllm.LLM[t.Any, t.Any]]) -> None: """Register a new model for this class. @@ -59,12 +64,12 @@ class BaseAutoLLMClass: if hasattr(llm_class, "config_class") and llm_class.config_class is not config_class: raise ValueError(f"The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has {llm_class.config_class} and you passed {config_class}. Fix one of those so they match!") cls._model_mapping.register(config_class, llm_class) + @classmethod def infer_class_from_name(cls, name: str) -> type[openllm.LLM[t.Any, t.Any]]: config_class = openllm.AutoConfig.infer_class_from_name(name) if config_class in cls._model_mapping: return cls._model_mapping[config_class] raise ValueError(f"Unrecognized configuration class ({config_class}) for {name}. Model name should be one of {', '.join(openllm.CONFIG_MAPPING.keys())} (Registered configuration class: {', '.join([i.__name__ for i in cls._model_mapping.keys()])}).") - def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any: if attr is None: return if isinstance(attr, tuple): return tuple(getattribute_from_module(module, a) for a in attr) @@ -72,10 +77,11 @@ def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any: # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the object at the top level. openllm_module = importlib.import_module("openllm") if module != openllm_module: - try: return getattribute_from_module(openllm_module, attr) - except ValueError: raise ValueError(f"Could not find {attr} neither in {module} nor in {openllm_module}!") from None + try: + return getattribute_from_module(openllm_module, attr) + except ValueError: + raise ValueError(f"Could not find {attr} neither in {module} nor in {openllm_module}!") from None raise ValueError(f"Could not find {attr} in {openllm_module}!") - class _LazyAutoMapping(OrderedDict, ReprMixin): """Based on transformers.models.auto.configuration_auto._LazyAutoMapping. 
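A toy illustration of the lazy config-to-implementation lookup that `_LazyAutoMapping` implements above: values stay as import targets until first access. The mapping contents here are illustrative, not OpenLLM's real registry:

```python
import importlib
from collections import OrderedDict

class LazyMapping(OrderedDict):
  """Maps a key to `module.attr`, importing the module only on first access."""
  def __init__(self, names: dict[str, tuple[str, str]]):
    super().__init__()
    self._names, self._cache = names, {}

  def __getitem__(self, key: str):
    if key not in self._cache:
      module_name, attr = self._names[key]
      self._cache[key] = getattr(importlib.import_module(module_name), attr)
    return self._cache[key]

mapping = LazyMapping({"json": ("json", "dumps")})
print(mapping["json"]({"ok": True}))  # the json module is only resolved at this point
```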
@@ -88,6 +94,7 @@ class _LazyAutoMapping(OrderedDict, ReprMixin): self._model_mapping = model_mapping self._extra_content: dict[t.Any, t.Any] = {} self._modules: dict[str, types.ModuleType] = {} + def __getitem__(self, key: type[openllm.LLMConfig]) -> type[openllm.LLM[t.Any, t.Any]]: if key in self._extra_content: return self._extra_content[key] model_type = self._reverse_config_mapping[key.__name__] @@ -97,24 +104,45 @@ class _LazyAutoMapping(OrderedDict, ReprMixin): for mtype in model_types: if mtype in self._model_mapping: return self._load_attr_from_module(mtype, self._model_mapping[mtype]) raise KeyError(key) + def _load_attr_from_module(self, model_type: str, attr: str) -> t.Any: module_name = inflection.underscore(model_type) if module_name not in self._modules: self._modules[module_name] = importlib.import_module(f".{module_name}", "openllm.models") return getattribute_from_module(self._modules[module_name], attr) - def __len__(self) -> int: return len(set(self._config_mapping.keys()).intersection(self._model_mapping.keys())) + len(self._extra_content) + + def __len__(self) -> int: + return len(set(self._config_mapping.keys()).intersection(self._model_mapping.keys())) + len(self._extra_content) + @property - def __repr_keys__(self) -> set[str]: return set(self._config_mapping.keys()) - def __repr__(self) -> str: return ReprMixin.__repr__(self) - def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]: yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping) - def __bool__(self) -> bool: return bool(self.keys()) - def keys(self) -> ConfigModelKeysView: return t.cast("ConfigModelKeysView", [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())) - def values(self) -> ConfigModelValuesView: return t.cast("ConfigModelValuesView", [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(self._extra_content.values())) - def items(self) -> ConfigModelItemsView: return t.cast("ConfigModelItemsView", [(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items())) - def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]: return iter(t.cast("SupportsIter[t.Iterator[type[openllm.LLMConfig]]]", self.keys())) + def __repr_keys__(self) -> set[str]: + return set(self._config_mapping.keys()) + + def __repr__(self) -> str: + return ReprMixin.__repr__(self) + + def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]: + yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping) + + def __bool__(self) -> bool: + return bool(self.keys()) + + def keys(self) -> ConfigModelKeysView: + return t.cast("ConfigModelKeysView", [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())) + + def values(self) -> ConfigModelValuesView: + return t.cast("ConfigModelValuesView", [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(self._extra_content.values())) + + def items(self) -> 
ConfigModelItemsView: + return t.cast("ConfigModelItemsView", [(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items())) + + def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]: + return iter(t.cast("SupportsIter[t.Iterator[type[openllm.LLMConfig]]]", self.keys())) + def __contains__(self, item: t.Any) -> bool: if item in self._extra_content: return True if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: return False return self._reverse_config_mapping[item.__name__] in self._model_mapping + def register(self, key: t.Any, value: t.Any) -> None: if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: if self._reverse_config_mapping[key.__name__] in self._model_mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM model.") diff --git a/openllm-python/src/openllm/models/auto/modeling_auto.py b/openllm-python/src/openllm/models/auto/modeling_auto.py index 9b08b8e1..f6a12d3e 100644 --- a/openllm-python/src/openllm/models/auto/modeling_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_auto.py @@ -3,7 +3,6 @@ import typing as t from collections import OrderedDict from .factory import BaseAutoLLMClass, _LazyAutoMapping from openllm_core.config import CONFIG_MAPPING_NAMES - MODEL_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLM"), ("dolly_v2", "DollyV2"), ("falcon", "Falcon"), ("flan_t5", "FlanT5"), ("gpt_neox", "GPTNeoX"), ("llama", "Llama"), ("mpt", "MPT"), ("opt", "OPT"), ("stablelm", "StableLM"), ("starcoder", "StarCoder"), ("baichuan", "Baichuan")]) MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) class AutoLLM(BaseAutoLLMClass): diff --git a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py index 4d1d7e98..18f316d5 100644 --- a/openllm-python/src/openllm/models/auto/modeling_flax_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_flax_auto.py @@ -3,7 +3,6 @@ import typing as t from collections import OrderedDict from .factory import BaseAutoLLMClass, _LazyAutoMapping from openllm_core.config import CONFIG_MAPPING_NAMES - MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5"), ("opt", "FlaxOPT")]) MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES) class AutoFlaxLLM(BaseAutoLLMClass): diff --git a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py index 0d5b4ed9..5ef52aef 100644 --- a/openllm-python/src/openllm/models/auto/modeling_tf_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_tf_auto.py @@ -3,7 +3,6 @@ import typing as t from collections import OrderedDict from .factory import BaseAutoLLMClass, _LazyAutoMapping from openllm_core.config import CONFIG_MAPPING_NAMES - MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5"), ("opt", "TFOPT")]) MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES) class AutoTFLLM(BaseAutoLLMClass): diff --git a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py index 94b1bae5..516e737b 100644 --- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py @@ -3,7 +3,6 @@ import 
typing as t from collections import OrderedDict from .factory import BaseAutoLLMClass, _LazyAutoMapping from openllm_core.config import CONFIG_MAPPING_NAMES - MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")]) MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES) class AutoVLLM(BaseAutoLLMClass): diff --git a/openllm-python/src/openllm/models/baichuan/__init__.py b/openllm-python/src/openllm/models/baichuan/__init__.py index cabb57e7..d919bbf1 100644 --- a/openllm-python/src/openllm/models/baichuan/__init__.py +++ b/openllm-python/src/openllm/models/baichuan/__init__.py @@ -2,22 +2,19 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available, is_vllm_available -from openllm_core.config.configuration_baichuan import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING, - BaichuanConfig as BaichuanConfig, -) - +from openllm_core.config.configuration_baichuan import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING, BaichuanConfig as BaichuanConfig _import_structure: dict[str, list[str]] = {} try: if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_baichuan"] = ["Baichuan"] if t.TYPE_CHECKING: from .modeling_baichuan import Baichuan as Baichuan try: if not is_vllm_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_vllm_baichuan"] = ["VLLMBaichuan"] if t.TYPE_CHECKING: from .modeling_vllm_baichuan import VLLMBaichuan as VLLMBaichuan diff --git a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py index c9fecdcf..f37fa864 100644 --- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py +++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py @@ -1,9 +1,9 @@ from __future__ import annotations import typing as t, openllm if t.TYPE_CHECKING: import transformers - class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]): __openllm_internal__ = True + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device) diff --git a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py index 602c361f..59cc4b99 100644 --- a/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py +++ b/openllm-python/src/openllm/models/baichuan/modeling_vllm_baichuan.py @@ -1,7 +1,6 @@ from __future__ import annotations import typing as t, openllm if t.TYPE_CHECKING: import vllm, transformers - class VLLMBaichuan(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]): __openllm_internal__ = True tokenizer_id = "local" diff --git a/openllm-python/src/openllm/models/chatglm/__init__.py 
b/openllm-python/src/openllm/models/chatglm/__init__.py index dc194a18..e9e8d207 100644 --- a/openllm-python/src/openllm/models/chatglm/__init__.py +++ b/openllm-python/src/openllm/models/chatglm/__init__.py @@ -2,16 +2,12 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available -from openllm_core.config.configuration_chatglm import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING, - ChatGLMConfig as ChatGLMConfig, -) - +from openllm_core.config.configuration_chatglm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING, ChatGLMConfig as ChatGLMConfig _import_structure: dict[str, list[str]] = {} try: if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_chatglm"] = ["ChatGLM"] if t.TYPE_CHECKING: from .modeling_chatglm import ChatGLM as ChatGLM diff --git a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py index ddf54a62..e25fa03e 100644 --- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py +++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py @@ -3,6 +3,7 @@ import typing as t, openllm if t.TYPE_CHECKING: import transformers class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]): __openllm_internal__ = True + def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]: import torch with torch.inference_mode(): @@ -10,6 +11,7 @@ class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrain # Only use half precision if the model is not yet quantized if self.config.use_half_precision: self.model.half() return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config()) + def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: import torch, torch.nn.functional as F embeddings: list[list[float]] = [] diff --git a/openllm-python/src/openllm/models/dolly_v2/__init__.py b/openllm-python/src/openllm/models/dolly_v2/__init__.py index 77b33c86..a655f7cd 100644 --- a/openllm-python/src/openllm/models/dolly_v2/__init__.py +++ b/openllm-python/src/openllm/models/dolly_v2/__init__.py @@ -2,22 +2,19 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available -from openllm_core.config.configuration_dolly_v2 import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING, - DollyV2Config as DollyV2Config, -) - +from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING, DollyV2Config as DollyV2Config _import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_dolly_v2"] = ["DollyV2"] if t.TYPE_CHECKING: from .modeling_dolly_v2 import DollyV2 as 
DollyV2 try: if not is_vllm_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_vllm_dolly_v2"] = ["VLLMDollyV2"] if t.TYPE_CHECKING: from .modeling_vllm_dolly_v2 import VLLMDollyV2 as VLLMDollyV2 diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py index fdc5bcd0..d63b59ba 100644 --- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -6,15 +6,18 @@ from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf else: torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow") logger = logging.getLogger(__name__) - @overload -def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline: ... +def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline: + ... @overload -def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]: ... +def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[False] = ..., **attrs: t.Any) -> type[transformers.Pipeline]: + ... def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: bool = False, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline: # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information. 
class InstructionTextGenerationPipeline(transformers.Pipeline): - def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any): super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs) + def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any): + super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs) + def _sanitize_parameters(self, return_full_text: bool | None = None, **generate_kwargs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any], dict[str, t.Any]]: if t.TYPE_CHECKING: assert self.tokenizer is not None preprocess_params: dict[str, t.Any] = {} @@ -29,11 +32,13 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr end_key_token_id = get_special_token_id(self.tokenizer, END_KEY) # Ensure generation stops once it generates "### End" generate_kwargs["eos_token_id"] = end_key_token_id - except ValueError: pass + except ValueError: + pass forward_params = generate_kwargs postprocess_params = {"response_key_token_id": response_key_token_id, "end_key_token_id": end_key_token_id} if return_full_text is not None: postprocess_params["return_full_text"] = return_full_text return preprocess_params, forward_params, postprocess_params + def preprocess(self, input_: str, **generate_kwargs: t.Any) -> t.Dict[str, t.Any]: if t.TYPE_CHECKING: assert self.tokenizer is not None prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=input_) @@ -41,6 +46,7 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr inputs["prompt_text"] = prompt_text inputs["instruction_text"] = input_ return t.cast(t.Dict[str, t.Any], inputs) + def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput: if t.TYPE_CHECKING: assert self.tokenizer is not None input_ids, attention_mask = input_tensors["input_ids"], input_tensors.get("attention_mask", None) @@ -52,6 +58,7 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr elif self.framework == "tf": generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])) instruction_text = input_tensors.pop("instruction_text") return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text} + def postprocess(self, model_outputs: dict[str, t.Any], response_key_token_id: int, end_key_token_id: int, return_full_text: bool = False) -> list[dict[t.Literal["generated_text"], str]]: if t.TYPE_CHECKING: assert self.tokenizer is not None _generated_sequence, instruction_text = model_outputs["generated_sequence"][0], model_outputs["instruction_text"] @@ -64,16 +71,20 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr if response_key_token_id and end_key_token_id: # Find where "### Response:" is first found in the generated tokens. Considering this is part of the # prompt, we should definitely find it. We will return the tokens found after this token. 
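The postprocess step that follows slices the generated token ids between the "### Response:" and "### End" marker tokens. A standalone illustration of that slicing with made-up token ids and no tokenizer involved:

from __future__ import annotations

def slice_response(sequence: list[int], response_key_token_id: int, end_key_token_id: int) -> list[int]:
  # Locate the "### Response:" marker; without it, fall back to the whole sequence.
  try:
    response_pos = sequence.index(response_key_token_id)
  except ValueError:
    return sequence
  # The "### End" marker may be absent if the response was truncated.
  try:
    end_pos = sequence.index(end_key_token_id)
  except ValueError:
    end_pos = None
  return sequence[response_pos + 1:end_pos]

print(slice_response([11, 42, 7, 8, 9, 99, 0], response_key_token_id=42, end_key_token_id=99))  # [7, 8, 9]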
- try: response_pos = sequence.index(response_key_token_id) - except ValueError: response_pos = None + try: + response_pos = sequence.index(response_key_token_id) + except ValueError: + response_pos = None if response_pos is None: logger.warning("Could not find response key %s in: %s", response_key_token_id, sequence) if response_pos: # Next find where "### End" is located. The model has been trained to end its responses with this # sequence (or actually, the token ID it maps to, since it is a special token). We may not find # this token, as the response could be truncated. If we don't find it then just return everything # to the end. Note that even though we set eos_token_id, we still see the this token at the end. - try: end_pos = sequence.index(end_key_token_id) - except ValueError: end_pos = None + try: + end_pos = sequence.index(end_key_token_id) + except ValueError: + end_pos = None decoded = self.tokenizer.decode(sequence[response_pos + 1:end_pos]).strip() if not decoded: # Otherwise we'll decode everything and use a regex to find the response and end. @@ -94,13 +105,19 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr if return_full_text: decoded = f"{instruction_text}\n{decoded}" records.append({"generated_text": t.cast(str, decoded)}) return records - return InstructionTextGenerationPipeline() if _init else InstructionTextGenerationPipeline + return InstructionTextGenerationPipeline() if _init else InstructionTextGenerationPipeline class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedTokenizer"]): __openllm_internal__ = True + @property - def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16}, {} - def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text) + def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16}, {} + + def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: + return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text) + def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]: llm_config = self.config.model_construct_env(**attrs) - with torch.inference_mode(): return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config()) + with torch.inference_mode(): + return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config()) diff --git a/openllm-python/src/openllm/models/falcon/__init__.py b/openllm-python/src/openllm/models/falcon/__init__.py index 2e2ae8e5..2d314998 100644 --- a/openllm-python/src/openllm/models/falcon/__init__.py +++ b/openllm-python/src/openllm/models/falcon/__init__.py @@ -2,22 +2,19 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available -from 
openllm_core.config.configuration_falcon import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING, - FalconConfig as FalconConfig, -) - +from openllm_core.config.configuration_falcon import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING, FalconConfig as FalconConfig _import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_falcon"] = ["Falcon"] if t.TYPE_CHECKING: from .modeling_falcon import Falcon as Falcon try: if not is_vllm_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_vllm_falcon"] = ["VLLMFalcon"] if t.TYPE_CHECKING: from .modeling_vllm_falcon import VLLMFalcon as VLLMFalcon diff --git a/openllm-python/src/openllm/models/falcon/modeling_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_falcon.py index 533b027b..0d6faf0b 100644 --- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py +++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py @@ -2,15 +2,18 @@ from __future__ import annotations import typing as t, openllm if t.TYPE_CHECKING: import torch, transformers else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers") - class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]): __openllm_internal__ = True + @property - def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {} + def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: + return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {} + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device) with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): # type: ignore[attr-defined] return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()), skip_special_tokens=True) + def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]: max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device) src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", openllm.StoppingCriteriaList([])) diff --git a/openllm-python/src/openllm/models/flan_t5/__init__.py b/openllm-python/src/openllm/models/flan_t5/__init__.py index 4d9a05a3..2a136a8e 100644 --- a/openllm-python/src/openllm/models/flan_t5/__init__.py +++ b/openllm-python/src/openllm/models/flan_t5/__init__.py @@ -2,28 +2,26 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError 
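Each of these `__init__` modules repeats the same optional-dependency pattern: probe for a backend, register the implementation in `_import_structure` only when the probe succeeds, and swallow `MissingDependencyError` otherwise so the package still imports cleanly. A condensed sketch of that pattern; `_is_available`, `modeling_example` and `ExampleModel` are illustrative stand-ins, not OpenLLM's actual helpers:

from __future__ import annotations
import importlib.util

class MissingDependencyError(Exception):
  pass

def _is_available(package: str) -> bool:
  return importlib.util.find_spec(package) is not None

_import_structure: dict[str, list[str]] = {}
try:
  if not _is_available("torch"): raise MissingDependencyError
except MissingDependencyError:
  pass  # backend missing: the implementation is simply not exported
else:
  _import_structure["modeling_example"] = ["ExampleModel"]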
from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available -from openllm_core.config.configuration_flan_t5 import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING, - FlanT5Config as FlanT5Config, -) - +from openllm_core.config.configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING, FlanT5Config as FlanT5Config _import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_flan_t5"] = ["FlanT5"] if t.TYPE_CHECKING: from .modeling_flan_t5 import FlanT5 as FlanT5 try: if not is_flax_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_flax_flan_t5"] = ["FlaxFlanT5"] if t.TYPE_CHECKING: from .modeling_flax_flan_t5 import FlaxFlanT5 as FlaxFlanT5 try: if not is_tf_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_tf_flan_t5"] = ["TFFlanT5"] if t.TYPE_CHECKING: from .modeling_tf_flan_t5 import TFFlanT5 as TFFlanT5 diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py index ee75d8e3..6936fbed 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -1,12 +1,14 @@ from __future__ import annotations import typing as t, openllm if t.TYPE_CHECKING: import transformers - class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformers.T5TokenizerFast"]): __openllm_internal__ = True + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch - with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) + with torch.inference_mode(): + return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) + def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: import torch, torch.nn.functional as F embeddings: list[list[float]] = [] diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index 3c4ca703..f6661c9e 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -3,12 +3,13 @@ import typing as t, openllm from openllm_core._prompt import process_prompt from openllm_core.config.configuration_flan_t5 import DEFAULT_PROMPT_TEMPLATE if t.TYPE_CHECKING: import transformers - class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]): __openllm_internal__ = True + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = 
None, repetition_penalty: float | None = None, decoder_start_token_id: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: if decoder_start_token_id is None: decoder_start_token_id = 0 return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty, "decoder_start_token_id": decoder_start_token_id}, {} + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation. decoder_start_token_id = attrs.pop("decoder_start_token_id", 0) diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index 1a542d69..43d656b5 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -1,7 +1,8 @@ from __future__ import annotations import typing as t, openllm if t.TYPE_CHECKING: import transformers - class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transformers.T5TokenizerFast"]): __openllm_internal__ = True - def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) + + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/gpt_neox/__init__.py b/openllm-python/src/openllm/models/gpt_neox/__init__.py index 7997dbd5..ac7f734d 100644 --- a/openllm-python/src/openllm/models/gpt_neox/__init__.py +++ b/openllm-python/src/openllm/models/gpt_neox/__init__.py @@ -2,22 +2,19 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available -from openllm_core.config.configuration_gpt_neox import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING, - GPTNeoXConfig as GPTNeoXConfig, -) - +from openllm_core.config.configuration_gpt_neox import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING, GPTNeoXConfig as GPTNeoXConfig _import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_gpt_neox"] = ["GPTNeoX"] if t.TYPE_CHECKING: from .modeling_gpt_neox import GPTNeoX as GPTNeoX try: if not is_vllm_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_vllm_gpt_neox"] = ["VLLMGPTNeoX"] if t.TYPE_CHECKING: from .modeling_vllm_gpt_neox import VLLMGPTNeoX as VLLMGPTNeoX diff --git 
a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py index 6016c325..03df3cc6 100644 --- a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py +++ b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py @@ -5,15 +5,19 @@ if t.TYPE_CHECKING: import transformers logger = logging.getLogger(__name__) class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True + @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {} + def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM: import transformers model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs) if self.config.use_half_precision: model.half() return model + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch - with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))) + with torch.inference_mode(): + return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))) diff --git a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py index 35e45015..fe34de4b 100644 --- a/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py +++ b/openllm-python/src/openllm/models/gpt_neox/modeling_vllm_gpt_neox.py @@ -1,7 +1,6 @@ from __future__ import annotations import typing as t, openllm if t.TYPE_CHECKING: import vllm, transformers - class VLLMGPTNeoX(openllm.LLM["vllm.LLMEngine", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True tokenizer_id = "local" diff --git a/openllm-python/src/openllm/models/llama/__init__.py b/openllm-python/src/openllm/models/llama/__init__.py index 6f2f03a3..f8a1fa89 100644 --- a/openllm-python/src/openllm/models/llama/__init__.py +++ b/openllm-python/src/openllm/models/llama/__init__.py @@ -2,23 +2,19 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available -from openllm_core.config.configuration_llama import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - PROMPT_MAPPING as PROMPT_MAPPING, - START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING, - LlamaConfig as LlamaConfig, -) - +from openllm_core.config.configuration_llama import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, PROMPT_MAPPING as PROMPT_MAPPING, START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING, LlamaConfig as LlamaConfig _import_structure: dict[str, list[str]] = {} try: if not is_vllm_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass 
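The `generate` methods reformatted throughout this diff share one shape: tokenize the prompt, call `model.generate` under `torch.inference_mode()`, then `batch_decode` the result. A standalone sketch of that shape with plain `transformers`; the model id and generation defaults are illustrative, not OpenLLM's configuration machinery:

import torch, transformers

def generate(prompt: str, model_id: str = "gpt2") -> list[str]:
  tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
  model = transformers.AutoModelForCausalLM.from_pretrained(model_id)
  inputs = tokenizer(prompt, return_tensors="pt")
  with torch.inference_mode():
    outputs = model.generate(**inputs, do_sample=True, max_new_tokens=32, pad_token_id=tokenizer.eos_token_id)
  return tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(generate("The quick brown fox")[0])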
else: _import_structure["modeling_vllm_llama"] = ["VLLMLlama"] if t.TYPE_CHECKING: from .modeling_vllm_llama import VLLMLlama as VLLMLlama try: if not is_torch_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_llama"] = ["Llama"] if t.TYPE_CHECKING: from .modeling_llama import Llama as Llama diff --git a/openllm-python/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py index 24610581..f0ea9d42 100644 --- a/openllm-python/src/openllm/models/llama/modeling_llama.py +++ b/openllm-python/src/openllm/models/llama/modeling_llama.py @@ -3,10 +3,12 @@ import typing as t, openllm if t.TYPE_CHECKING: import transformers class Llama(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaTokenizerFast"]): __openllm_internal__ = True + @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} + def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: import torch, torch.nn.functional as F encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device) diff --git a/openllm-python/src/openllm/models/mpt/__init__.py b/openllm-python/src/openllm/models/mpt/__init__.py index 004abd64..5b07e125 100644 --- a/openllm-python/src/openllm/models/mpt/__init__.py +++ b/openllm-python/src/openllm/models/mpt/__init__.py @@ -2,23 +2,19 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available -from openllm_core.config.configuration_mpt import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - PROMPT_MAPPING as PROMPT_MAPPING, - START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING, - MPTConfig as MPTConfig, -) - +from openllm_core.config.configuration_mpt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, PROMPT_MAPPING as PROMPT_MAPPING, START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING, MPTConfig as MPTConfig _import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_mpt"] = ["MPT"] if t.TYPE_CHECKING: from .modeling_mpt import MPT as MPT try: if not is_vllm_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_vllm_mpt"] = ["VLLMMPT"] if t.TYPE_CHECKING: from .modeling_vllm_mpt import VLLMMPT as VLLMMPT diff --git a/openllm-python/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py index 827c5e9f..4c044035 100644 --- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py +++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py @@ -15,13 +15,16 @@ def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torc return config class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True + def llm_post_init(self) -> None: import torch self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32 + @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch return {"device_map": "auto" if torch.cuda.is_available() and 
torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32}, {} + def import_model(self, *args: t.Any, trust_remote_code: bool = True, **attrs: t.Any) -> bentoml.Model: import torch, transformers _, tokenizer_attrs = self.llm_parameters @@ -32,8 +35,11 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs) if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, config=config, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, device_map=device_map, **attrs) - try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) - finally: torch.cuda.empty_cache() + try: + return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) + finally: + torch.cuda.empty_cache() + def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.PreTrainedModel: import transformers torch_dtype = attrs.pop("torch_dtype", self.dtype) @@ -43,6 +49,7 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs) model.tie_weights() return model + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch llm_config = self.config.model_construct_env(**attrs) @@ -52,5 +59,6 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken if torch.cuda.is_available(): with torch.autocast("cuda", torch.float16): # type: ignore[attr-defined] generated_tensors = self.model.generate(**inputs, **attrs) - else: generated_tensors = self.model.generate(**inputs, **attrs) + else: + generated_tensors = self.model.generate(**inputs, **attrs) return self.tokenizer.batch_decode(generated_tensors, skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/opt/__init__.py b/openllm-python/src/openllm/models/opt/__init__.py index 87cb0b2d..bb9bec5b 100644 --- a/openllm-python/src/openllm/models/opt/__init__.py +++ b/openllm-python/src/openllm/models/opt/__init__.py @@ -2,34 +2,33 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available -from openllm_core.config.configuration_opt import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING, - OPTConfig as OPTConfig, -) - +from openllm_core.config.configuration_opt import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING, OPTConfig as OPTConfig _import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_opt"] = ["OPT"] if t.TYPE_CHECKING: from .modeling_opt import OPT as OPT try: if not is_flax_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_flax_opt"] = ["FlaxOPT"] if 
t.TYPE_CHECKING: from .modeling_flax_opt import FlaxOPT as FlaxOPT try: if not is_vllm_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_vllm_opt"] = ["VLLMOPT"] if t.TYPE_CHECKING: from .modeling_vllm_opt import VLLMOPT as VLLMOPT try: if not is_tf_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_tf_opt"] = ["TFOPT"] if t.TYPE_CHECKING: from .modeling_tf_opt import TFOPT as TFOPT diff --git a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py index 7858b42d..81c66d80 100644 --- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py @@ -9,9 +9,14 @@ else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transf logger = logging.getLogger(__name__) class FlaxOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]): __openllm_internal__ = True + def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) tokenizer.pad_token_id = config.pad_token_id return bentoml.transformers.save_model(self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty}, {} - def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True) + + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty}, {} + + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/opt/modeling_opt.py 
b/openllm-python/src/openllm/models/opt/modeling_opt.py index 32b27713..601d038c 100644 --- a/openllm-python/src/openllm/models/opt/modeling_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_opt.py @@ -5,10 +5,13 @@ if t.TYPE_CHECKING: import transformers logger = logging.getLogger(__name__) class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer"]): __openllm_internal__ = True + @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch - with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) + with torch.inference_mode(): + return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py index 8ab0eaa7..0e66335e 100644 --- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py @@ -4,9 +4,12 @@ from openllm_core.utils import generate_labels if t.TYPE_CHECKING: import transformers class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tokenizer"]): __openllm_internal__ = True + def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: import transformers config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) tokenizer.pad_token_id = config.pad_token_id return bentoml.transformers.save_model(self.tag, transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) - def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) + + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: + return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py index 5ca8c77d..f3b78975 100644 --- a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py @@ -6,4 +6,6 @@ if t.TYPE_CHECKING: import vllm, transformers class VLLMOPT(openllm.LLM["vllm.LLMEngine", "transformers.GPT2Tokenizer"]): __openllm_internal__ = True tokenizer_id = "local" - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, 
use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {} + + def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {} diff --git a/openllm-python/src/openllm/models/stablelm/__init__.py b/openllm-python/src/openllm/models/stablelm/__init__.py index 3f7f1258..41322e30 100644 --- a/openllm-python/src/openllm/models/stablelm/__init__.py +++ b/openllm-python/src/openllm/models/stablelm/__init__.py @@ -2,22 +2,19 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available -from openllm_core.config.configuration_stablelm import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING, - StableLMConfig as StableLMConfig, -) - +from openllm_core.config.configuration_stablelm import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING, StableLMConfig as StableLMConfig _import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_stablelm"] = ["StableLM"] if t.TYPE_CHECKING: from .modeling_stablelm import StableLM as StableLM try: if not is_vllm_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_vllm_stablelm"] = ["VLLMStableLM"] if t.TYPE_CHECKING: from .modeling_vllm_stablelm import VLLMStableLM as VLLMStableLM diff --git a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py index 77db1d0c..3e7f8e13 100644 --- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py +++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py @@ -3,13 +3,17 @@ import typing as t, openllm if t.TYPE_CHECKING: import transformers class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNeoXTokenizerFast"]): __openllm_internal__ = True + def llm_post_init(self) -> None: import torch self.bettertransformer = True if not torch.cuda.is_available() else False + @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch - with torch.inference_mode(): return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), 
pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0], skip_special_tokens=True)] + with torch.inference_mode(): + return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0], skip_special_tokens=True)] diff --git a/openllm-python/src/openllm/models/starcoder/__init__.py b/openllm-python/src/openllm/models/starcoder/__init__.py index f6e68aff..c1c07f9f 100644 --- a/openllm-python/src/openllm/models/starcoder/__init__.py +++ b/openllm-python/src/openllm/models/starcoder/__init__.py @@ -2,22 +2,19 @@ from __future__ import annotations import sys, typing as t from openllm.exceptions import MissingDependencyError from openllm.utils import LazyModule, is_torch_available, is_vllm_available -from openllm_core.config.configuration_starcoder import ( - DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, - START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING, - StarCoderConfig as StarCoderConfig, -) - +from openllm_core.config.configuration_starcoder import DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE, START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING, StarCoderConfig as StarCoderConfig _import_structure: dict[str, list[str]] = {} try: if not is_torch_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_starcoder"] = ["StarCoder"] if t.TYPE_CHECKING: from .modeling_starcoder import StarCoder as StarCoder try: if not is_vllm_available(): raise MissingDependencyError -except MissingDependencyError: pass +except MissingDependencyError: + pass else: _import_structure["modeling_vllm_starcoder"] = ["VLLMStarCoder"] if t.TYPE_CHECKING: from .modeling_vllm_starcoder import VLLMStarCoder as VLLMStarCoder diff --git a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py index 83e7fc17..db789d15 100644 --- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py +++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py @@ -5,18 +5,23 @@ from openllm_core.config.configuration_starcoder import EOD, FIM_MIDDLE, FIM_PAD if t.TYPE_CHECKING: import transformers class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers.GPT2TokenizerFast"]): __openllm_internal__ = True + @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: import torch return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} + def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: import torch, transformers torch_dtype, device_map = attrs.pop("torch_dtype", torch.float16), attrs.pop("device_map", "auto") tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) tokenizer.add_special_tokens({"additional_special_tokens": [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], "pad_token": EOD}) model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs) 
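Most `import_kwargs` properties in this diff pick dtype and device placement the same way: half precision when CUDA is available, full precision on CPU, and `device_map="auto"` only when more than one GPU is visible. A small standalone sketch of that selection (the function name is made up):

import torch

def default_load_kwargs() -> dict[str, object]:
  cuda = torch.cuda.is_available()
  multi_gpu = cuda and torch.cuda.device_count() > 1
  return {"torch_dtype": torch.float16 if cuda else torch.float32, "device_map": "auto" if multi_gpu else None}

print(default_load_kwargs())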
- try: return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) - finally: torch.cuda.empty_cache() + try: + return bentoml.transformers.save_model(self.tag, model, custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) + finally: + torch.cuda.empty_cache() + def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch with torch.inference_mode(): @@ -26,6 +31,7 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers. # TODO: We will probably want to return the tokenizer here so that we can manually process this # return (skip_special_tokens=False, clean_up_tokenization_spaces=False)) return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True) + def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]: max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device) src_len, stopping_criteria = encoded_inputs["input_ids"].shape[1], preprocess_generate_kwds.pop("stopping_criteria", openllm.StoppingCriteriaList([])) diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index 226f4c43..9f3ef617 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -30,12 +30,7 @@ from openllm_core._typing_compat import M, T, ParamSpec if t.TYPE_CHECKING: import bentoml - from . import ( - constants as constants, - ggml as ggml, - transformers as transformers, - ) - + from . import constants as constants, ggml as ggml, transformers as transformers P = ParamSpec("P") def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: """Load the tokenizer from BentoML store. @@ -49,11 +44,14 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: bentomodel_fs = fs.open_fs(llm._bentomodel.path) if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME): with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, "rb") as cofile: - try: tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"] - except KeyError: raise openllm.exceptions.OpenLLMException("Bento model does not have tokenizer. Make sure to save" - " the tokenizer within the model via 'custom_objects'." - " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None - else: tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath("/"), trust_remote_code=llm.__llm_trust_remote_code__, **tokenizer_attrs) + try: + tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"] + except KeyError: + raise openllm.exceptions.OpenLLMException("Bento model does not have tokenizer. Make sure to save" + " the tokenizer within the model via 'custom_objects'." 
+ " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None + else: + tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath("/"), trust_remote_code=llm.__llm_trust_remote_code__, **tokenizer_attrs) if tokenizer.pad_token_id is None: if config.pad_token_id is not None: tokenizer.pad_token_id = config.pad_token_id @@ -61,10 +59,9 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: elif tokenizer.eos_token_id is not None: tokenizer.pad_token_id = tokenizer.eos_token_id else: tokenizer.add_special_tokens({"pad_token": "[PAD]"}) return tokenizer - class _Caller(t.Protocol[P]): - def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: ... - + def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: + ... _extras = ["get", "import_model", "save_pretrained", "load_model"] def _make_dispatch_function(fn: str) -> _Caller[P]: def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: @@ -75,17 +72,25 @@ def _make_dispatch_function(fn: str) -> _Caller[P]: > [!NOTE] See 'openllm.serialisation.ggml' if 'llm.runtime="ggml"' """ return getattr(importlib.import_module(f".{llm.runtime}", __name__), fn)(llm, *args, **kwargs) + return caller - if t.TYPE_CHECKING: - def get(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model: ... - def import_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model: ... - def save_pretrained(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> None: ... - def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M: ... + def get(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model: + ... + + def import_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> bentoml.Model: + ... + + def save_pretrained(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> None: + ... + + def load_model(llm: openllm.LLM[M, T], *args: t.Any, **kwargs: t.Any) -> M: + ... _import_structure: dict[str, list[str]] = {"ggml": [], "transformers": [], "constants": []} __all__ = ["ggml", "transformers", "constants", "load_tokenizer", *_extras] -def __dir__() -> list[str]: return sorted(__all__) +def __dir__() -> list[str]: + return sorted(__all__) def __getattr__(name: str) -> t.Any: if name == "load_tokenizer": return load_tokenizer elif name in _import_structure: return importlib.import_module(f".{name}", __name__) diff --git a/openllm-python/src/openllm/serialisation/ggml.py b/openllm-python/src/openllm/serialisation/ggml.py index 33aa6754..53ddadf7 100644 --- a/openllm-python/src/openllm/serialisation/ggml.py +++ b/openllm-python/src/openllm/serialisation/ggml.py @@ -9,8 +9,8 @@ import bentoml, openllm if t.TYPE_CHECKING: from openllm_core._typing_compat import M _conversion_strategy = {"pt": "ggml"} - -def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model: raise NotImplementedError("Currently work in progress.") +def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model: + raise NotImplementedError("Currently work in progress.") def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model: """Return an instance of ``bentoml.Model`` from given LLM instance. 
@@ -30,5 +30,7 @@ def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Mo if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__) raise -def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M: raise NotImplementedError("Currently work in progress.") -def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs: t.Any) -> None: raise NotImplementedError("Currently work in progress.") +def load_model(llm: openllm.LLM[M, t.Any], *decls: t.Any, **attrs: t.Any) -> M: + raise NotImplementedError("Currently work in progress.") +def save_pretrained(llm: openllm.LLM[t.Any, t.Any], save_directory: str, **attrs: t.Any) -> None: + raise NotImplementedError("Currently work in progress.") diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index 84d5d403..c95ef99f 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -7,19 +7,12 @@ from simple_di import Provide, inject from bentoml._internal.configuration.containers import BentoMLContainer from bentoml._internal.models.model import ModelOptions from .weights import HfIgnore -from ._helpers import ( - check_unintialised_params, - infer_autoclass_from_llm, - infer_tokenizers_from_llm, - make_model_signatures, - process_config, - update_model, -) +from ._helpers import check_unintialised_params, infer_autoclass_from_llm, infer_tokenizers_from_llm, make_model_signatures, process_config, update_model if t.TYPE_CHECKING: import types - import vllm, auto_gptq as autogptq, transformers ,torch + import vllm, auto_gptq as autogptq, transformers, torch import torch.nn from bentoml._internal.models import ModelStore @@ -33,7 +26,6 @@ else: logger = logging.getLogger(__name__) __all__ = ["import_model", "get", "load_model", "save_pretrained"] - @inject def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, _model_store: ModelStore = Provide[BentoMLContainer.model_store], **attrs: t.Any) -> bentoml.Model: """Auto detect model type from given model_id and import it to bentoml's model store. @@ -106,7 +98,8 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, else: # we will clone all things into the bentomodel path without loading model into memory snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm)) - except Exception: raise + except Exception: + raise else: bentomodel.flush() # type: ignore[no-untyped-call] bentomodel.save(_model_store) @@ -117,7 +110,6 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, # in the case where users first run openllm start without the model available locally. if openllm.utils.is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache() return bentomodel - def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: """Return an instance of ``bentoml.Model`` from given LLM instance. @@ -128,7 +120,8 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: """ try: model = bentoml.models.get(llm.tag) - if model.info.module not in ("openllm.serialisation.transformers" "bentoml.transformers", "bentoml._internal.frameworks.transformers", __name__): # NOTE: backward compatible with previous version of OpenLLM.
+ if model.info.module not in ("openllm.serialisation.transformers" + "bentoml.transformers", "bentoml._internal.frameworks.transformers", __name__): # NOTE: backward compatible with previous version of OpenLLM. raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.") if "runtime" in model.info.labels and model.info.labels["runtime"] != llm.runtime: raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.") @@ -136,7 +129,6 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: except bentoml.exceptions.NotFound as err: if auto_import: return import_model(llm, trust_remote_code=llm.__llm_trust_remote_code__) raise err from None - def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: """Load the model from BentoML store. @@ -156,7 +148,6 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer() if llm.__llm_implementation__ in {"pt", "vllm"}: check_unintialised_params(model) return t.cast("M", model) - def save_pretrained(llm: openllm.LLM[M, T], save_directory: str, is_main_process: bool = True, state_dict: DictStrAny | None = None, save_function: t.Any | None = None, push_to_hub: bool = False, max_shard_size: int | str = "10GB", safe_serialization: bool = False, variant: str | None = None, **attrs: t.Any) -> None: save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save)) model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs) diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index 9251c2e3..d1638572 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -8,10 +8,10 @@ if t.TYPE_CHECKING: from transformers.models.auto.auto_factory import _BaseAutoModelClass from bentoml._internal.models.model import ModelSignaturesType from openllm_core._typing_compat import DictStrAny, M, T -else: transformers, torch = openllm_core.utils.LazyLoader("transformers", globals(), "transformers"), openllm_core.utils.LazyLoader("torch", globals(), "torch") +else: + transformers, torch = openllm_core.utils.LazyLoader("transformers", globals(), "transformers"), openllm_core.utils.LazyLoader("torch", globals(), "torch") _object_setattr = object.__setattr__ - def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]: """A helper function that correctly parse config and attributes for transformers.PretrainedConfig. 
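`process_config`, whose definition begins above, is built around `transformers.AutoConfig.from_pretrained(..., return_unused_kwargs=True)`, which hands back both the parsed config and any keyword arguments the config class did not consume. A small sketch of that behaviour; the extra keyword is deliberately bogus:

import transformers

config, unused = transformers.AutoConfig.from_pretrained(
  "gpt2",  # illustrative model id
  return_unused_kwargs=True,
  my_custom_flag=123,  # not a GPT2Config attribute, so it comes back untouched
)
print(type(config).__name__, unused)  # GPT2Config {'my_custom_flag': 123}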
@@ -31,12 +31,10 @@ def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tu if copied_attrs.get("torch_dtype", None) == "auto": copied_attrs.pop("torch_dtype") config, attrs = transformers.AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **hub_attrs, **copied_attrs) return config, hub_attrs, attrs - def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T: __cls = getattr(transformers, openllm_core.utils.first_not_none(__llm.config["tokenizer_class"], default="AutoTokenizer"), None) if __cls is None: raise ValueError(f"Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`") return __cls - def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.PretrainedConfig, /) -> _BaseAutoModelClass: if llm.config["trust_remote_code"]: autoclass = "AutoModelForSeq2SeqLM" if llm.config["model_type"] == "seq2seq_lm" else "AutoModelForCausalLM" @@ -50,22 +48,16 @@ def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.Pretra elif type(config) in transformers.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: idx = 1 else: raise openllm.exceptions.OpenLLMException(f"Model type {type(config)} is not supported yet.") return getattr(transformers, FRAMEWORK_TO_AUTOCLASS_MAPPING[llm.__llm_implementation__][idx]) - def check_unintialised_params(model: torch.nn.Module) -> None: unintialized = [n for n, param in model.named_parameters() if param.data.device == torch.device("meta")] if len(unintialized) > 0: raise RuntimeError(f"Found the following unintialized parameters in {model}: {unintialized}") - def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model: based: DictStrAny = copy.deepcopy(bentomodel.info.metadata) based.update(metadata) - _object_setattr( - bentomodel, "_info", - ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged - tag=bentomodel.info.tag, module=bentomodel.info.module, labels=bentomodel.info.labels, options=bentomodel.info.options.to_dict(), signatures=bentomodel.info.signatures, context=bentomodel.info.context, api_version=bentomodel.info.api_version, creation_time=bentomodel.info.creation_time, metadata=based - ) - ) + _object_setattr(bentomodel, "_info", ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged + tag=bentomodel.info.tag, module=bentomodel.info.module, labels=bentomodel.info.labels, options=bentomodel.info.options.to_dict(), signatures=bentomodel.info.signatures, context=bentomodel.info.context, api_version=bentomodel.info.api_version, creation_time=bentomodel.info.creation_time, metadata=based + )) return bentomodel - # NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType: infer_fn: tuple[str, ...] 
= ("__call__",) diff --git a/openllm-python/src/openllm/serialisation/transformers/weights.py b/openllm-python/src/openllm/serialisation/transformers/weights.py index f6017893..984e211b 100644 --- a/openllm-python/src/openllm/serialisation/transformers/weights.py +++ b/openllm-python/src/openllm/serialisation/transformers/weights.py @@ -4,14 +4,15 @@ from huggingface_hub import HfApi if t.TYPE_CHECKING: import openllm from openllm_core._typing_compat import M, T - -def has_safetensors_weights(model_id: str, revision: str | None = None) -> bool: return any(s.rfilename.endswith(".safetensors") for s in HfApi().model_info(model_id, revision=revision).siblings) +def has_safetensors_weights(model_id: str, revision: str | None = None) -> bool: + return any(s.rfilename.endswith(".safetensors") for s in HfApi().model_info(model_id, revision=revision).siblings) @attr.define(slots=True) class HfIgnore: safetensors = "*.safetensors" pt = "*.bin" tf = "*.h5" flax = "*.msgpack" + @classmethod def ignore_patterns(cls, llm: openllm.LLM[M, T]) -> list[str]: if llm.__llm_implementation__ == "vllm": base = [cls.tf, cls.flax, cls.safetensors] diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py index 414baea8..dcdc918c 100644 --- a/openllm-python/src/openllm/utils/__init__.py +++ b/openllm-python/src/openllm/utils/__init__.py @@ -5,18 +5,13 @@ we won't ensure backward compatibility for these functions. So use with caution. """ from __future__ import annotations import typing as t, openllm_core -from . import ( - dummy_flax_objects as dummy_flax_objects, - dummy_pt_objects as dummy_pt_objects, - dummy_tf_objects as dummy_tf_objects, - dummy_vllm_objects as dummy_vllm_objects, -) +from . import (dummy_flax_objects as dummy_flax_objects, dummy_pt_objects as dummy_pt_objects, dummy_tf_objects as dummy_tf_objects, dummy_vllm_objects as dummy_vllm_objects,) if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralRuntime import openllm - -def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]: return {"runtime": llm.runtime, "framework": "openllm", "model_name": llm.config["model_name"], "architecture": llm.config["architecture"], "serialisation_format": llm._serialisation_format} +def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]: + return {"runtime": llm.runtime, "framework": "openllm", "model_name": llm.config["model_name"], "architecture": llm.config["architecture"], "serialisation_format": llm._serialisation_format} def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]: import openllm if implementation == "tf": return openllm.AutoTFLLM @@ -24,9 +19,9 @@ def infer_auto_class(implementation: LiteralRuntime) -> type[openllm.AutoLLM | o elif implementation == "pt": return openllm.AutoLLM elif implementation == "vllm": return openllm.AutoVLLM else: raise RuntimeError(f"Unknown implementation: {implementation} (supported: 'pt', 'flax', 'tf', 'vllm')") - __all__ = ["generate_labels", "infer_auto_class", "dummy_flax_objects", "dummy_pt_objects", "dummy_tf_objects", "dummy_vllm_objects"] -def __dir__() -> t.Sequence[str]: return sorted(__all__) +def __dir__() -> t.Sequence[str]: + return sorted(__all__) def __getattr__(it: str) -> t.Any: if hasattr(openllm_core.utils, it): return getattr(openllm_core.utils, it) else: raise AttributeError(f"module {__name__} has no attribute {it}") diff --git a/openllm-python/tests/__init__.py 
b/openllm-python/tests/__init__.py index dd602334..7c7411f2 100644 --- a/openllm-python/tests/__init__.py +++ b/openllm-python/tests/__init__.py @@ -2,7 +2,6 @@ from __future__ import annotations import os from hypothesis import HealthCheck, settings - settings.register_profile("CI", settings(suppress_health_check=[HealthCheck.too_slow]), deadline=None) if "CI" in os.environ: settings.load_profile("CI") diff --git a/openllm-python/tests/_strategies/_configuration.py b/openllm-python/tests/_strategies/_configuration.py index 860e5e7f..e0fdf576 100644 --- a/openllm-python/tests/_strategies/_configuration.py +++ b/openllm-python/tests/_strategies/_configuration.py @@ -2,22 +2,28 @@ from __future__ import annotations import logging, typing as t, openllm from openllm_core._configuration import ModelSettings from hypothesis import strategies as st - logger = logging.getLogger(__name__) env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_name in openllm.CONFIG_MAPPING.keys()]) - @st.composite def model_settings(draw: st.DrawFn): """Strategy for generating ModelSettings objects.""" kwargs: dict[str, t.Any] = { - "default_id": st.text(min_size=1), "model_ids": st.lists(st.text(), min_size=1), "architecture": st.text(min_size=1), "url": st.text(), "requires_gpu": st.booleans(), "trust_remote_code": st.booleans(), "requirements": st.none() - | st.lists(st.text(), min_size=1), "default_implementation": st.dictionaries(st.sampled_from(["cpu", "nvidia.com/gpu"]), st.sampled_from(["vllm", "pt", "tf", "flax"])), "model_type": st.sampled_from(["causal_lm", "seq2seq_lm"]), "runtime": st.sampled_from(["transformers", "ggml"]), "name_type": st.sampled_from(["dasherize", "lowercase"]), "timeout": st.integers( - min_value=3600 - ), "workers_per_resource": st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)), + "default_id": st.text(min_size=1), + "model_ids": st.lists(st.text(), min_size=1), + "architecture": st.text(min_size=1), + "url": st.text(), + "requires_gpu": st.booleans(), + "trust_remote_code": st.booleans(), + "requirements": st.none() | st.lists(st.text(), min_size=1), + "default_implementation": st.dictionaries(st.sampled_from(["cpu", "nvidia.com/gpu"]), st.sampled_from(["vllm", "pt", "tf", "flax"])), + "model_type": st.sampled_from(["causal_lm", "seq2seq_lm"]), + "runtime": st.sampled_from(["transformers", "ggml"]), + "name_type": st.sampled_from(["dasherize", "lowercase"]), + "timeout": st.integers(min_value=3600), + "workers_per_resource": st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)), } return draw(st.builds(ModelSettings, **kwargs)) - def make_llm_config(cls_name: str, dunder_config: dict[str, t.Any] | ModelSettings, fields: tuple[tuple[t.LiteralString, str, t.Any], ...] | None = None, generation_fields: tuple[tuple[t.LiteralString, t.Any], ...] 
| None = None,) -> type[openllm.LLMConfig]: globs: dict[str, t.Any] = {"openllm": openllm} _config_args: list[str] = [] diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py index f7d8e7d6..7eba9618 100644 --- a/openllm-python/tests/configuration_test.py +++ b/openllm-python/tests/configuration_test.py @@ -4,7 +4,6 @@ from unittest import mock from openllm_core._configuration import GenerationConfig, ModelSettings, field_env_key from hypothesis import assume, given, strategies as st from ._strategies._configuration import make_llm_config, model_settings - # XXX: @aarnphm fixes TypedDict behaviour in 3.11 @pytest.mark.skipif(sys.version_info[:2] == (3, 11), reason="TypedDict in 3.11 behaves differently, so we need to fix this") def test_missing_default(): @@ -14,7 +13,6 @@ def test_missing_default(): make_llm_config("MissingModelId", {"default_id": "huggingface/t5-tiny-testing", "requirements": ["bentoml"]}) with pytest.raises(ValueError, match="Missing required fields *"): make_llm_config("MissingArchitecture", {"default_id": "huggingface/t5-tiny-testing", "model_ids": ["huggingface/t5-tiny-testing"], "requirements": ["bentoml"],},) - def test_forbidden_access(): cl_ = make_llm_config("ForbiddenAccess", {"default_id": "huggingface/t5-tiny-testing", "model_ids": ["huggingface/t5-tiny-testing", "bentoml/t5-tiny-testing"], "architecture": "PreTrainedModel", "requirements": ["bentoml"],},) @@ -22,7 +20,6 @@ def test_forbidden_access(): assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), "GenerationConfig",) assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), "SamplingParams",) assert openllm.utils.lenient_issubclass(cl_.__openllm_generation_class__, GenerationConfig) - @given(model_settings()) def test_class_normal_gen(gen_settings: ModelSettings): assume(gen_settings["default_id"] and all(i for i in gen_settings["model_ids"])) @@ -30,23 +27,19 @@ def test_class_normal_gen(gen_settings: ModelSettings): assert issubclass(cl_, openllm.LLMConfig) for key in gen_settings: assert object.__getattribute__(cl_, f"__openllm_{key}__") == gen_settings.__getitem__(key) - @given(model_settings(), st.integers()) def test_simple_struct_dump(gen_settings: ModelSettings, field1: int): cl_ = make_llm_config("IdempotentLLM", gen_settings, fields=(("field1", "float", field1),)) assert cl_().model_dump()["field1"] == field1 - @given(model_settings(), st.integers()) def test_config_derivation(gen_settings: ModelSettings, field1: int): cl_ = make_llm_config("IdempotentLLM", gen_settings, fields=(("field1", "float", field1),)) new_cls = cl_.model_derivate("DerivedLLM", default_id="asdfasdf") assert new_cls.__openllm_default_id__ == "asdfasdf" - @given(model_settings()) def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings): cl_ = make_llm_config("AttrsProtocolLLM", gen_settings) assert attr.has(cl_) - @given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0),) def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float): cl_ = make_llm_config("ComplexLLM", gen_settings, fields=(("field1", "float", field1),), generation_fields=(("temperature", temperature),),) @@ -65,12 +58,10 @@ def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperatu pas_nested = 
cl_(generation_config={"temperature": input_temperature}, field1=input_field1) assert pas_nested.model_dump()["field1"] == input_field1 assert pas_nested.model_dump()["generation_config"]["temperature"] == input_temperature - @contextlib.contextmanager def patch_env(**attrs: t.Any): with mock.patch.dict(os.environ, attrs, clear=True): yield - def test_struct_envvar(): with patch_env(**{field_env_key("env_llm", "field1"): "4", field_env_key("env_llm", "temperature", suffix="generation"): "0.2",}): @@ -88,7 +79,6 @@ def test_struct_envvar(): overwrite_default = EnvLLM() assert overwrite_default.field1 == 4 assert overwrite_default["temperature"] == 0.2 - def test_struct_provided_fields(): class EnvLLM(openllm.LLMConfig): __config__ = {"default_id": "asdfasdf", "model_ids": ["asdf", "asdfasdfads"], "architecture": "PreTrainedModel",} @@ -100,7 +90,6 @@ def test_struct_provided_fields(): sent = EnvLLM.model_construct_env(field1=20, temperature=0.4) assert sent.field1 == 20 assert sent.generation_config.temperature == 0.4 - def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mk: mk.setenv(field_env_key("overwrite_with_env_available", "field1"), str(4.0)) @@ -108,13 +97,11 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat sent = make_llm_config("OverwriteWithEnvAvailable", {"default_id": "asdfasdf", "model_ids": ["asdf", "asdfasdfads"], "architecture": "PreTrainedModel"}, fields=(("field1", "float", 3.0),),).model_construct_env(field1=20.0, temperature=0.4) assert sent.generation_config.temperature == 0.4 assert sent.field1 == 20.0 - @given(model_settings()) @pytest.mark.parametrize(("return_dict", "typ"), [(True, dict), (False, transformers.GenerationConfig)]) def test_conversion_to_transformers(return_dict: bool, typ: type[t.Any], gen_settings: ModelSettings): cl_ = make_llm_config("ConversionLLM", gen_settings) assert isinstance(cl_().to_generation_config(return_as_dict=return_dict), typ) - @given(model_settings()) def test_click_conversion(gen_settings: ModelSettings): # currently our conversion omit Union type. 
@@ -126,7 +113,6 @@ def test_click_conversion(gen_settings: ModelSettings): filtered = {k for k, v in cl_.__openllm_hints__.items() if t.get_origin(v) is not t.Union} click_options_filtered = [i for i in wrapped.__click_params__ if i.name and not i.name.startswith("fake_")] assert len(filtered) == len(click_options_filtered) - @pytest.mark.parametrize("model_name", openllm.CONFIG_MAPPING.keys()) def test_configuration_dict_protocol(model_name: str): config = openllm.AutoConfig.for_model(model_name) diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py index e5106f9e..b0333ec3 100644 --- a/openllm-python/tests/conftest.py +++ b/openllm-python/tests/conftest.py @@ -13,7 +13,6 @@ def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunn for framework, prompt in itertools.product(runtime_impl, _PROMPT_MAPPING.keys()): llm = openllm.Runner(model, model_id=_FRAMEWORK_MAPPING[model], ensure_available=True, implementation=framework, init_local=True,) yield prompt, llm - def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: if os.getenv("GITHUB_ACTIONS") is None: if "prompt" in metafunc.fixturenames and "llm" in metafunc.fixturenames: diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py index 3f55000c..a2c31a48 100644 --- a/openllm-python/tests/models/conftest.py +++ b/openllm-python/tests/models/conftest.py @@ -5,7 +5,6 @@ import attr, docker, docker.errors, docker.types, orjson, pytest, openllm from syrupy.extensions.json import JSONSnapshotExtension from openllm._llm import normalise_model_name from openllm_core._typing_compat import DictStrAny, ListAny - logger = logging.getLogger(__name__) if t.TYPE_CHECKING: @@ -14,7 +13,6 @@ if t.TYPE_CHECKING: from syrupy.types import PropertyFilter, PropertyMatcher, SerializableData, SerializedData from openllm._configuration import GenerationConfig from openllm.client import BaseAsyncClient - class ResponseComparator(JSONSnapshotExtension): def serialize(self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None,) -> SerializedData: if openllm.utils.LazyType(ListAny).isinstance(data): @@ -52,11 +50,9 @@ class ResponseComparator(JSONSnapshotExtension): return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and eq_config(s.marshaled_config, t.marshaled_config)) return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)]) - @pytest.fixture() def response_snapshot(snapshot: SnapshotAssertion): return snapshot.use_extension(ResponseComparator) - @attr.define(init=False) class _Handle(ABC): port: int @@ -88,7 +84,6 @@ class _Handle(ABC): except Exception: time.sleep(1) raise RuntimeError(f"Handle failed to initialise within {timeout} seconds.") - @attr.define(init=False) class LocalHandle(_Handle): process: subprocess.Popen[bytes] @@ -98,12 +93,10 @@ class LocalHandle(_Handle): def status(self) -> bool: return self.process.poll() is None - class HandleProtocol(t.Protocol): @contextlib.contextmanager def __call__(*, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None,) -> t.Generator[_Handle, None, None]: ... 
- @attr.define(init=False) class DockerHandle(_Handle): container_name: str @@ -115,7 +108,6 @@ class DockerHandle(_Handle): def status(self) -> bool: container = self.docker_client.containers.get(self.container_name) return container.status in ["running", "created"] - @contextlib.contextmanager def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal["container", "local"], quantize: t.Literal["int8", "int4", "gptq"] | None = None, *, _serve_grpc: bool = False,): with openllm.utils.reserve_free_port() as port: @@ -136,7 +128,6 @@ def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t. proc.stdout.close() if proc.stderr: proc.stderr.close() - @contextlib.contextmanager def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal["container", "local"], quantize: t.Literal["int8", "int4", "gptq"] | None = None, *, _serve_grpc: bool = False,): envvar = openllm.utils.EnvVarMixin(model) @@ -177,23 +168,19 @@ def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode print(container_output, file=sys.stderr) container.remove() - @pytest.fixture(scope="session", autouse=True) def clean_context() -> t.Generator[contextlib.ExitStack, None, None]: stack = contextlib.ExitStack() yield stack stack.close() - @pytest.fixture(scope="module") def el() -> t.Generator[asyncio.AbstractEventLoop, None, None]: loop = asyncio.get_event_loop() yield loop loop.close() - @pytest.fixture(params=["container", "local"], scope="session") def deployment_mode(request: pytest.FixtureRequest) -> str: return request.param - @pytest.fixture(scope="module") def handler(el: asyncio.AbstractEventLoop, deployment_mode: t.Literal["container", "local"]): if deployment_mode == "container": diff --git a/openllm-python/tests/models/flan_t5_test.py b/openllm-python/tests/models/flan_t5_test.py index 189fc79f..6fd9bd42 100644 --- a/openllm-python/tests/models/flan_t5_test.py +++ b/openllm-python/tests/models/flan_t5_test.py @@ -9,21 +9,17 @@ if t.TYPE_CHECKING: import contextlib from .conftest import HandleProtocol, ResponseComparator, _Handle - model = "flan_t5" model_id = "google/flan-t5-small" - @pytest.fixture(scope="module") def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal["container", "local"], clean_context: contextlib.ExitStack,): with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag: with handler(model=model, model_id=model_id, image_tag=image_tag) as handle: yield handle - @pytest.fixture(scope="module") async def flan_t5(flan_t5_handle: _Handle): await flan_t5_handle.health(240) return flan_t5_handle.client - @pytest.mark.asyncio() async def test_flan_t5(flan_t5: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator): client = await flan_t5 diff --git a/openllm-python/tests/models/opt_test.py b/openllm-python/tests/models/opt_test.py index 49c4101e..f08ce291 100644 --- a/openllm-python/tests/models/opt_test.py +++ b/openllm-python/tests/models/opt_test.py @@ -9,21 +9,17 @@ if t.TYPE_CHECKING: import contextlib from .conftest import HandleProtocol, ResponseComparator, _Handle - model = "opt" model_id = "facebook/opt-125m" - @pytest.fixture(scope="module") def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal["container", "local"], clean_context: contextlib.ExitStack,): with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, 
clean_context=clean_context) as image_tag: with handler(model=model, model_id=model_id, image_tag=image_tag) as handle: yield handle - @pytest.fixture(scope="module") async def opt_125m(opt_125m_handle: _Handle): await opt_125m_handle.health(240) return opt_125m_handle.client - @pytest.mark.asyncio() async def test_opt_125m(opt_125m: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator): client = await opt_125m diff --git a/openllm-python/tests/models_test.py b/openllm-python/tests/models_test.py index d0b35219..4720ef87 100644 --- a/openllm-python/tests/models_test.py +++ b/openllm-python/tests/models_test.py @@ -2,19 +2,16 @@ from __future__ import annotations import os, typing as t, pytest if t.TYPE_CHECKING: import openllm - @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") is not None, reason="Model is too large for CI") def test_flan_t5_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]): assert llm(prompt) assert llm(prompt, temperature=0.8, top_p=0.23) - @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") is not None, reason="Model is too large for CI") def test_opt_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]): assert llm(prompt) assert llm(prompt, temperature=0.9, top_k=8) - @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") is not None, reason="Model is too large for CI") def test_baichuan_implementation(prompt: str, llm: openllm.LLM[t.Any, t.Any]): assert llm(prompt) diff --git a/openllm-python/tests/package_test.py b/openllm-python/tests/package_test.py index a50a8b13..69c57916 100644 --- a/openllm-python/tests/package_test.py +++ b/openllm-python/tests/package_test.py @@ -6,7 +6,6 @@ if t.TYPE_CHECKING: from pathlib import Path HF_INTERNAL_T5_TESTING = "hf-internal-testing/tiny-random-t5" actions_xfail = functools.partial(pytest.mark.xfail, condition=os.getenv("GITHUB_ACTIONS") is not None, reason="Marking GitHub Actions to xfail due to flakiness and building environment not isolated.",) - @actions_xfail def test_general_build_with_internal_testing(): bento_store = BentoMLContainer.bento_store.get() @@ -19,7 +18,6 @@ def test_general_build_with_internal_testing(): bento = openllm.build("flan-t5", model_id=HF_INTERNAL_T5_TESTING) assert len(bento_store.list(bento.tag)) == 1 - @actions_xfail def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory): local_path = tmp_path_factory.mktemp("local_t5") @@ -31,13 +29,11 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory): llm.save_pretrained(local_path) assert openllm.build("flan-t5", model_id=local_path.resolve().__fspath__(), model_version="local") - @pytest.fixture() def dockerfile_template(tmp_path_factory: pytest.TempPathFactory): file = tmp_path_factory.mktemp("dockerfiles") / "Dockerfile.template" file.write_text("{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}") return file - @pytest.mark.usefixtures("dockerfile_template") @actions_xfail def test_build_with_custom_dockerfile(dockerfile_template: Path): diff --git a/openllm-python/tests/strategies_test.py b/openllm-python/tests/strategies_test.py index 162dbde4..610a6712 100644 --- a/openllm-python/tests/strategies_test.py +++ b/openllm-python/tests/strategies_test.py @@ -3,7 +3,6 @@ import os, typing as t, pytest, bentoml from openllm_core import _strategies as strategy from openllm_core._strategies import CascadingResourceStrategy, NvidiaGpuResource, get_resource if t.TYPE_CHECKING: from 
_pytest.monkeypatch import MonkeyPatch - def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: mcls.setenv("CUDA_VISIBLE_DEVICES", "0,1") @@ -11,7 +10,6 @@ def test_nvidia_gpu_resource_from_env(monkeypatch: pytest.MonkeyPatch): assert len(resource) == 2 assert resource == ["0", "1"] mcls.delenv("CUDA_VISIBLE_DEVICES") - def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: mcls.setenv("CUDA_VISIBLE_DEVICES", "0,2,-1,1") @@ -19,7 +17,6 @@ def test_nvidia_gpu_cutoff_minus(monkeypatch: pytest.MonkeyPatch): assert len(resource) == 2 assert resource == ["0", "2"] mcls.delenv("CUDA_VISIBLE_DEVICES") - def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: mcls.setenv("CUDA_VISIBLE_DEVICES", "-1") @@ -27,7 +24,6 @@ def test_nvidia_gpu_neg_val(monkeypatch: pytest.MonkeyPatch): assert len(resource) == 0 assert resource == [] mcls.delenv("CUDA_VISIBLE_DEVICES") - def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: mcls.setenv("CUDA_VISIBLE_DEVICES", "GPU-5ebe9f43-ac33420d4628") @@ -53,7 +49,6 @@ def test_nvidia_gpu_parse_literal(monkeypatch: pytest.MonkeyPatch): assert len(resource) == 1 assert resource == ["MIG-GPU-5ebe9f43-ac33420d4628"] mcls.delenv("CUDA_VISIBLE_DEVICES") - @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") is not None, reason="skip GPUs test on CI") def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: @@ -64,7 +59,6 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch): assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],).match("Input list should be all string type.") assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match("Input list should be all string type.") assert pytest.raises(ValueError, NvidiaGpuResource.validate, ["GPU-5ebe9f43", "GPU-ac33420d4628"]).match("Failed to parse available GPUs UUID") - def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: # to make this tests works with system that has GPU @@ -91,13 +85,10 @@ def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch): NvidiaGpuResource.from_spec(1.5) with pytest.raises(ValueError): assert NvidiaGpuResource.from_spec(-2) - class GPURunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu") - def unvalidated_get_resource(x: dict[str, t.Any], y: str, validate: bool = False): return get_resource(x, y, validate=validate) - @pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"]) def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str): monkeypatch.setattr(strategy, "get_resource", unvalidated_get_resource) @@ -108,7 +99,6 @@ def test_cascade_strategy_worker_count(monkeypatch: MonkeyPatch, gpu_type: str): assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 9]}, 0.5) == 1 assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 7, 8, 9]}, 0.5) == 1 assert CascadingResourceStrategy.get_worker_count(GPURunnable, {gpu_type: [2, 5, 7, 8, 9]}, 0.4) == 1 - @pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"]) def test_cascade_strategy_worker_env(monkeypatch: MonkeyPatch, gpu_type: str): monkeypatch.setattr(strategy, "get_resource", unvalidated_get_resource) @@ -147,7 +137,6 @@ def test_cascade_strategy_worker_env(monkeypatch: 
MonkeyPatch, gpu_type: str): assert envs.get("CUDA_VISIBLE_DEVICES") == "7,8" envs = CascadingResourceStrategy.get_worker_env(GPURunnable, {gpu_type: [2, 6, 7, 8, 9]}, 0.4, 2) assert envs.get("CUDA_VISIBLE_DEVICES") == "9" - @pytest.mark.parametrize("gpu_type", ["nvidia.com/gpu", "amd.com/gpu"]) def test_cascade_strategy_disabled_via_env(monkeypatch: MonkeyPatch, gpu_type: str): monkeypatch.setattr(strategy, "get_resource", unvalidated_get_resource) diff --git a/pyproject.toml b/pyproject.toml index a7e9b5f4..922f86ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -246,6 +246,13 @@ avoid-escape = false "openllm-client/src/openllm_client/bentoclient/_http.py" = ["PERF203"] "typings/**" = ["F", "E"] +[tool.yapfignore] +ignore_patterns = [ + "openllm-python/src/openllm/playground", + "openllm-python/src/openllm/utils/dummy_*.py", + "openllm-python/src/openllm/models/__init__.py" +] + [tool.yapf] ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true ALLOW_MULTILINE_DICTIONARY_KEYS = false @@ -253,41 +260,46 @@ ALLOW_MULTILINE_LAMBDAS = false ALLOW_SPLIT_BEFORE_DEFAULT_OR_NAMED_ASSIGNS = false ALLOW_SPLIT_BEFORE_DICT_VALUE = false ARITHMETIC_PRECEDENCE_INDICATION = true -BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1 -BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1 +BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 0 +BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 0 BLANK_LINE_BEFORE_CLASS_DOCSTRING = false BLANK_LINE_BEFORE_MODULE_DOCSTRING = false BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false COALESCE_BRACKETS = true -COLUMN_LIMIT = 768 +COLUMN_LIMIT = 512 CONTINUATION_ALIGN_STYLE = "VALIGN-RIGHT" DEDENT_CLOSING_BRACKETS = true DISABLE_ENDING_COMMA_HEURISTIC = true -EACH_DICT_ENTRY_ON_SEPARATE_LINE = false +EACH_DICT_ENTRY_ON_SEPARATE_LINE = true INDENT_BLANK_LINES = false INDENT_CLOSING_BRACKETS = false INDENT_WIDTH = 2 JOIN_MULTIPLE_LINES = true NO_SPACES_AROUND_SELECTED_BINARY_OPERATORS = true SPACES_AROUND_SUBSCRIPT_COLON = false +SPACES_AROUND_DICT_DELIMITERS = false +SPACES_AROUND_LIST_DELIMITERS = false +SPACES_AROUND_POWER_OPERATOR = false +SPACES_AROUND_TUPLE_DELIMITERS = false SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = false SPACE_INSIDE_BRACKETS = false SPLIT_ALL_COMMA_SEPARATED_VALUES = false -SPLIT_ALL_TOP_LEVEL_COMMA_SEPARATED_VALUES = false +SPLIT_ALL_TOP_LEVEL_COMMA_SEPARATED_VALUES = true SPLIT_ARGUMENTS_WHEN_COMMA_TERMINATED = false SPLIT_BEFORE_BITWISE_OPERATOR = false SPLIT_BEFORE_CLOSING_BRACKET = false SPLIT_BEFORE_DICT_SET_GENERATOR = false -SPLIT_BEFORE_DOT = false +SPLIT_BEFORE_DOT = true # similar to how rust format its expression SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = false SPLIT_BEFORE_FIRST_ARGUMENT = false SPLIT_BEFORE_LOGICAL_OPERATOR = false SPLIT_BEFORE_NAMED_ASSIGNS = false -SPLIT_COMPLEX_COMPREHENSION = false -SPLIT_PENALTY_AFTER_OPENING_BRACKET = 10000 +SPLIT_COMPLEX_COMPREHENSION = true +SPLIT_PENALTY_AFTER_OPENING_BRACKET = 350 SPLIT_PENALTY_BEFORE_IF_EXPR = 10000 -SPLIT_PENALTY_COMPREHENSION = 3000 -SPLIT_PENALTY_FOR_ADDED_LINE_SPLIT = 8000 +SPLIT_PENALTY_COMPREHENSION = 2500 +SPLIT_PENALTY_FOR_ADDED_LINE_SPLIT = 5000 +USE_TABS = false [tool.pytest.ini_options] addopts = ["-rfEX", "-pno:warnings", "--snapshot-warn-unused"] @@ -397,10 +409,8 @@ pretty = true python_version = "3.8" show_error_codes = true strict = true -warn_no_return = false warn_return_any = false warn_unreachable = true -warn_unused_ignores = false [[tool.mypy.overrides]] ignore_missing_imports = true module = [ diff --git a/tools/yapf b/tools/yapf index 
84a198c2..2389ae42 100755 --- a/tools/yapf +++ b/tools/yapf @@ -6,4 +6,6 @@ exit 1 ) -yapf -pdrm 2> /dev/null || exit 0 +yapf -pri openllm-python/** 2>/dev/null +yapf -pri openllm-core/** 2>/dev/null +yapf -pri openllm-client/** 2>/dev/null
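For reference only, and not part of the patch itself: below is a minimal, hypothetical Python sketch of the code shape that the new [tool.yapf] settings in pyproject.toml encode, namely two-space indentation (INDENT_WIDTH = 2) and no blank lines between top-level definitions (BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 0), with lines wrapping only once they exceed the 512-character COLUMN_LIMIT. All names in the sketch are invented for illustration and do not exist in the repository.

from __future__ import annotations
import typing as t
def _normalise(name: str) -> str:
  # two-space indent; no blank line is kept before the next top-level definition
  return name.replace("-", "_").lower()
def _labels(names: t.Sequence[str]) -> dict[str, str]:
  # short enough to stay on a single line under the 512-character column limit
  return {n: _normalise(n) for n in names}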