diff --git a/openllm-client/src/openllm_client/benmin/_grpc.py b/openllm-client/src/openllm_client/benmin/_grpc.py
index 985f75d2..1d1daae3 100644
--- a/openllm-client/src/openllm_client/benmin/_grpc.py
+++ b/openllm-client/src/openllm_client/benmin/_grpc.py
@@ -20,12 +20,36 @@ class ClientCredentials(t.TypedDict):
   private_key: NotRequired[t.Union[bytes, str]]
   certificate_chain: NotRequired[t.Union[bytes, str]]
 @overload
-def dispatch_channel(server_url: str, typ: t.Literal["async"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = ...) -> aio.Channel:
+def dispatch_channel(
+  server_url: str,
+  typ: t.Literal["async"],
+  ssl: bool = ...,
+  ssl_client_credentials: ClientCredentials | None = ...,
+  options: t.Any | None = ...,
+  compression: grpc.Compression | None = ...,
+  interceptors: t.Sequence[aio.ClientInterceptor] | None = ...
+) -> aio.Channel:
   ...
 @overload
-def dispatch_channel(server_url: str, typ: t.Literal["sync"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> grpc.Channel:
+def dispatch_channel(
+  server_url: str,
+  typ: t.Literal["sync"],
+  ssl: bool = ...,
+  ssl_client_credentials: ClientCredentials | None = ...,
+  options: t.Any | None = ...,
+  compression: grpc.Compression | None = ...,
+  interceptors: t.Sequence[aio.ClientInterceptor] | None = None
+) -> grpc.Channel:
   ...
-def dispatch_channel(server_url: str, typ: t.Literal["async", "sync"] = "sync", ssl: bool = False, ssl_client_credentials: ClientCredentials | None = None, options: t.Any | None = None, compression: grpc.Compression | None = None, interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> aio.Channel | grpc.Channel:
+def dispatch_channel(
+  server_url: str,
+  typ: t.Literal["async", "sync"] = "sync",
+  ssl: bool = False,
+  ssl_client_credentials: ClientCredentials | None = None,
+  options: t.Any | None = None,
+  compression: grpc.Compression | None = None,
+  interceptors: t.Sequence[aio.ClientInterceptor] | None = None
+) -> aio.Channel | grpc.Channel:
   credentials = None
   if ssl:
     if ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
@@ -42,8 +66,16 @@ class GrpcClient(Client):
   options: t.Any
   compression: t.Optional[grpc.Compression]
-  def __init__(self, server_url: str, svc: bentoml.Service, # gRPC specific options
-               ssl: bool = False, options: t.Any | None = None, compression: grpc.Compression | None = None, ssl_client_credentials: ClientCredentials | None = None, **kwargs: t.Any) -> None:
+  def __init__(
+    self,
+    server_url: str,
+    svc: bentoml.Service, # gRPC specific options
+    ssl: bool = False,
+    options: t.Any | None = None,
+    compression: grpc.Compression | None = None,
+    ssl_client_credentials: ClientCredentials | None = None,
+    **kwargs: t.Any
+  ) -> None:
     self.ssl, self.ssl_client_credentials, self.options, self.compression = ssl, ssl_client_credentials, options, compression
     super().__init__(server_url, svc, **kwargs)
@@ -57,7 +89,14 @@ class GrpcClient(Client):
   @staticmethod
   def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
-    with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel:
+    with dispatch_channel(
+      f"{host.replace(r'localhost', '0.0.0.0')}:{port}",
+      typ="sync",
+      options=kwargs.get("options", None),
+      compression=kwargs.get("compression", None),
+      ssl=kwargs.get("ssl", False),
+      ssl_client_credentials=kwargs.get("ssl_client_credentials", None)
+    ) as channel:
       req = pb_health.HealthCheckRequest()
       req.service = "bentoml.grpc.v1.BentoService"
       health_stub = services_health.HealthStub(channel)
@@ -80,12 +119,34 @@ class GrpcClient(Client):
   @classmethod
   def from_url(cls, url: str, **kwargs: t.Any) -> GrpcClient:
-    with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel:
-      metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest()))
+    with dispatch_channel(
+      url.replace(r"localhost", "0.0.0.0"),
+      typ="sync",
+      options=kwargs.get("options", None),
+      compression=kwargs.get("compression", None),
+      ssl=kwargs.get("ssl", False),
+      ssl_client_credentials=kwargs.get("ssl_client_credentials", None)
+    ) as channel:
+      metadata = t.cast(
+        "ServiceMetadataResponse",
+        channel.unary_unary(
+          "/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString
+        )(pb.ServiceMetadataRequest())
+      )
       reflection = bentoml.Service(metadata.name)
       for api in metadata.apis:
         try:
-          reflection.apis[api.name] = InferenceAPI[t.Any](None, bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}), bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}), name=api.name, doc=api.docs)
+          reflection.apis[api.name] = InferenceAPI[t.Any](
+            None,
+            bentoml.io.from_spec({
+              "id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)
+            }),
+            bentoml.io.from_spec({
+              "id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)
+            }),
+            name=api.name,
+            doc=api.docs
+          )
         except Exception as e:
           logger.error("Failed to instantiate client for API %s: ", api.name, e)
       return cls(url, reflection, **kwargs)
@@ -111,8 +172,17 @@ class AsyncGrpcClient(AsyncClient):
   interceptors: t.Optional[t.Sequence[aio.ClientInterceptor]]
   compression: t.Optional[grpc.Compression]
-  def __init__(self, server_url: str, svc: bentoml.Service, # gRPC specific options
-               ssl: bool = False, options: aio.ChannelArgumentType | None = None, interceptors: t.Sequence[aio.ClientInterceptor] | None = None, compression: grpc.Compression | None = None, ssl_client_credentials: ClientCredentials | None = None, **kwargs: t.Any) -> None:
+  def __init__(
+    self,
+    server_url: str,
+    svc: bentoml.Service, # gRPC specific options
+    ssl: bool = False,
+    options: aio.ChannelArgumentType | None = None,
+    interceptors: t.Sequence[aio.ClientInterceptor] | None = None,
+    compression: grpc.Compression | None = None,
+    ssl_client_credentials: ClientCredentials | None = None,
+    **kwargs: t.Any
+  ) -> None:
     self.ssl, self.ssl_client_credentials, self.options, self.interceptors, self.compression = ssl, ssl_client_credentials, options, interceptors, compression
     super().__init__(server_url, svc, **kwargs)
@@ -126,7 +196,14 @@ class AsyncGrpcClient(AsyncClient):
   @staticmethod
   async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
-    async with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel:
+    async with dispatch_channel(
+      f"{host.replace(r'localhost', '0.0.0.0')}:{port}",
+      typ="async",
+      options=kwargs.get("options", None),
+      compression=kwargs.get("compression", None),
+      ssl=kwargs.get("ssl", False),
+      ssl_client_credentials=kwargs.get("ssl_client_credentials", None)
+    ) as channel:
       req = pb_health.HealthCheckRequest()
       req.service = "bentoml.grpc.v1.BentoService"
       health_stub = services_health.HealthStub(channel)
@@ -149,12 +226,35 @@ class AsyncGrpcClient(AsyncClient):
   @classmethod
   async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncGrpcClient:
-    async with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None), interceptors=kwargs.get("interceptors", None)) as channel:
-      metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest()))
+    async with dispatch_channel(
+      url.replace(r"localhost", "0.0.0.0"),
+      typ="async",
+      options=kwargs.get("options", None),
+      compression=kwargs.get("compression", None),
+      ssl=kwargs.get("ssl", False),
+      ssl_client_credentials=kwargs.get("ssl_client_credentials", None),
+      interceptors=kwargs.get("interceptors", None)
+    ) as channel:
+      metadata = t.cast(
+        "ServiceMetadataResponse",
+        channel.unary_unary(
+          "/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString
+        )(pb.ServiceMetadataRequest())
+      )
       reflection = bentoml.Service(metadata.name)
       for api in metadata.apis:
         try:
-          reflection.apis[api.name] = InferenceAPI[t.Any](None, bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}), bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}), name=api.name, doc=api.docs)
+          reflection.apis[api.name] = InferenceAPI[t.Any](
+            None,
+            bentoml.io.from_spec({
+              "id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)
+            }),
+            bentoml.io.from_spec({
+              "id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)
+            }),
+            name=api.name,
+            doc=api.docs
+          )
         except Exception as e:
           logger.error("Failed to instantiate client for API %s: ", api.name, e)
       return cls(url, reflection, **kwargs)
diff --git a/openllm-client/src/openllm_client/benmin/_http.py b/openllm-client/src/openllm_client/benmin/_http.py
index 4426b8af..45bc280a 100644
--- a/openllm-client/src/openllm_client/benmin/_http.py
+++ b/openllm-client/src/openllm_client/benmin/_http.py
@@ -51,7 +51,14 @@ class HttpClient(Client):
       if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
       if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
       try:
-        reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/"))
+        reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](
+          None,
+          bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]),
+          bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]),
+          name=meth_spec["x-bentoml-name"],
+          doc=meth_spec["description"],
+          route=route.lstrip("/")
+        )
       except Exception as e:
         logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e)
     return cls(url, reflection)
@@ -69,7 +76,12 @@ class HttpClient(Client):
     if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
     else: body = fake_resp.body
-    resp = self.inner.post("/" + _inference_api.route if not _inference_api.route.startswith("/") else _inference_api.route, data=body, headers={"content-type": fake_resp.headers["content-type"]}, timeout=self.timeout)
+    resp = self.inner.post(
+      "/" + _inference_api.route if not _inference_api.route.startswith("/") else _inference_api.route,
+      data=body,
+      headers={"content-type": fake_resp.headers["content-type"]},
+      timeout=self.timeout
+    )
     if resp.status_code != 200: raise ValueError(f"Error while making request: {resp.status_code}: {resp.content!s}")
     fake_req = starlette.requests.Request(scope={"type": "http"})
     headers = starlette.datastructures.Headers(headers=resp.headers)
@@ -122,7 +134,14 @@ class AsyncHttpClient(AsyncClient):
       if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
       if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
       try:
-        reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/"))
+        reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](
+          None,
+          bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]),
+          bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]),
+          name=meth_spec["x-bentoml-name"],
+          doc=meth_spec["description"],
+          route=route.lstrip("/")
+        )
       except ValueError as e:
         logger.error("Failed to instantiate client for API %s: ", meth_spec["x-bentoml-name"], e)
     return cls(url, reflection)
@@ -140,7 +159,12 @@ class AsyncHttpClient(AsyncClient):
     if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
     else: body = t.cast(t.Any, fake_resp.body)
-    resp = await self.inner.post("/" + _inference_api.route if not _inference_api.route.startswith("/") else _inference_api.route, data=body, headers={"content-type": fake_resp.headers["content-type"]}, timeout=self.timeout)
+    resp = await self.inner.post(
+      "/" + _inference_api.route if not _inference_api.route.startswith("/") else _inference_api.route,
+      data=body,
+      headers={"content-type": fake_resp.headers["content-type"]},
+      timeout=self.timeout
+    )
     if resp.status_code != 200: raise ValueError(f"Error making request: {resp.status_code}: {(await resp.aread())!s}")
     fake_req = starlette.requests.Request(scope={"type": "http"})
     headers = starlette.datastructures.Headers(headers=resp.headers)
diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py
index 867ad829..c1dea50c 100644
--- a/openllm-core/src/openllm_core/_configuration.py
+++ b/openllm-core/src/openllm_core/_configuration.py
@@ -141,8 +141,19 @@ class FineTuneConfig:
     def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
       ...
-  adapter_type: PeftType = dantic.Field("lora", description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'", use_default_converter=False, converter=_adapter_converter)
-  adapter_config: t.Dict[str, t.Any] = dantic.Field(None, description="The configuration for the adapter. The content of the dict depends on the adapter type.", validator=attr.validators.optional(attr.validators.instance_of(dict)), converter=attr.converters.default_if_none(factory=dict), use_default_converter=False)
+  adapter_type: PeftType = dantic.Field(
+    "lora",
+    description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'",
+    use_default_converter=False,
+    converter=_adapter_converter
+  )
+  adapter_config: t.Dict[str, t.Any] = dantic.Field(
+    None,
+    description="The configuration for the adapter. The content of the dict depends on the adapter type.",
+    validator=attr.validators.optional(attr.validators.instance_of(dict)),
+    converter=attr.converters.default_if_none(factory=dict),
+    use_default_converter=False
+  )
   inference_mode: bool = dantic.Field(False, description="Whether to use this Adapter for inference", use_default_converter=False)
   llm_config_class: type[LLMConfig] = dantic.Field(None, description="The reference class to openllm.LLMConfig", use_default_converter=False)
@@ -175,41 +186,97 @@ class GenerationConfig(ReprMixin):
   via ``LLMConfig.generation_config``.
   """
   max_new_tokens: int = dantic.Field(20, ge=0, description="The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.")
-  min_length: int = dantic.Field(0, ge=0, description="The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.")
+  min_length: int = dantic.Field(
+    0,
+    ge=0,
+    description="The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set."
+  )
   min_new_tokens: int = dantic.Field(description="The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.")
-  early_stopping: bool = dantic.Field(False, description="""Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates; `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm) """)
-  max_time: float = dantic.Field(description="The maximum amount of time you allow the computation to run for in seconds. generation will still finish the current pass after allocated time has been passed.")
+  early_stopping: bool = dantic.Field(
+    False,
+    description="""Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates; `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm) """
+  )
+  max_time: float = dantic.Field(
+    description="The maximum amount of time you allow the computation to run for in seconds. generation will still finish the current pass after allocated time has been passed."
+  )
   num_beams: int = dantic.Field(1, description="Number of beams for beam search. 1 means no beam search.")
-  num_beam_groups: int = dantic.Field(1, description="Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.")
+  num_beam_groups: int = dantic.Field(
+    1,
+    description="Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details."
+  )
   penalty_alpha: float = dantic.Field(description="The values balance the model confidence and the degeneration penalty in contrastive search decoding.")
   use_cache: bool = dantic.Field(True, description="Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.")
   temperature: float = dantic.Field(1.0, ge=0.0, le=1.0, description="The value used to modulate the next token probabilities.")
   top_k: int = dantic.Field(50, description="The number of highest probability vocabulary tokens to keep for top-k-filtering.")
-  top_p: float = dantic.Field(1.0, description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.")
-  typical_p: float = dantic.Field(1.0, description="Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to `typical_p` or higher are kept for generation. See [this paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.")
-  epsilon_cutoff: float = dantic.Field(0.0, description="If set to float strictly between 0 and 1, only tokens with a conditional probability greater than `epsilon_cutoff` will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details.")
+  top_p: float = dantic.Field(
+    1.0, description="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation."
+  )
+  typical_p: float = dantic.Field(
+    1.0,
+    description="Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to `typical_p` or higher are kept for generation. See [this paper](https://arxiv.org/pdf/2202.00666.pdf) for more details."
+  )
+  epsilon_cutoff: float = dantic.Field(
+    0.0,
+    description="If set to float strictly between 0 and 1, only tokens with a conditional probability greater than `epsilon_cutoff` will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details."
+  )
   eta_cutoff: float = dantic.Field(
     0.0,
     description="""Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more details. """
   )
-  diversity_penalty: float = dantic.Field(0.0, description="This value is subtracted from a beam's score if it generates a token same as any beam from other group at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. ")
-  repetition_penalty: float = dantic.Field(1.0, description="The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.")
-  encoder_repetition_penalty: float = dantic.Field(1.0, description="The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty.")
-  length_penalty: float = dantic.Field(1.0, description="Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences.")
+  diversity_penalty: float = dantic.Field(
+    0.0,
+    description="This value is subtracted from a beam's score if it generates a token same as any beam from other group at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. "
+  )
+  repetition_penalty: float = dantic.Field(
+    1.0, description="The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details."
+  )
+  encoder_repetition_penalty: float = dantic.Field(
+    1.0, description="The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty."
+  )
+  length_penalty: float = dantic.Field(
+    1.0,
+    description="Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences."
+  )
   no_repeat_ngram_size: int = dantic.Field(0, description="If set to int > 0, all ngrams of that size can only occur once.")
-  bad_words_ids: t.List[t.List[int]] = dantic.Field(description="List of token ids that are not allowed to be generated. In order to get the token ids of the words that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`.")
-  force_words_ids: t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]] = dantic.Field(description="List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one can allow different forms of each word. ")
-  renormalize_logits: bool = dantic.Field(False, description="Whether to renormalize the logits after applying all the logits processors or warpers (including the custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization. ")
-  constraints: t.List[Constraint] = dantic.Field(description="Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by ``Constraint`` objects, in the most sensible way possible.")
-  forced_bos_token_id: int = dantic.Field(description="The id of the token to force as the first generated token after the ``decoder_start_token_id``. Useful for multilingual models like [mBART](https://huggingface.co/docs/transformers/model_doc/mbart) where the first generated token needs to be the target language token. ")
-  forced_eos_token_id: t.Union[int, t.List[int]] = dantic.Field(description="The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens.")
-  remove_invalid_values: bool = dantic.Field(False, description="Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. Note that using `remove_invalid_values` can slow down generation.")
-  exponential_decay_length_penalty: t.Tuple[int, float] = dantic.Field(description="This tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty starts and `decay_factor` represents the factor of exponential decay")
-  suppress_tokens: t.List[int] = dantic.Field(description="A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled.")
-  begin_suppress_tokens: t.List[int] = dantic.Field(description="A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. ")
-  forced_decoder_ids: t.List[t.List[int]] = dantic.Field(description="A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123.")
+  bad_words_ids: t.List[t.List[int]] = dantic.Field(
+    description="List of token ids that are not allowed to be generated. In order to get the token ids of the words that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`."
+  )
+  force_words_ids: t.Union[t.List[t.List[int]], t.List[t.List[t.List[int]]]] = dantic.Field(
+    description="List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one can allow different forms of each word. "
+  )
+  renormalize_logits: bool = dantic.Field(
+    False,
+    description="Whether to renormalize the logits after applying all the logits processors or warpers (including the custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits are normalized but some logit processors or warpers break the normalization. "
+  )
+  constraints: t.List[Constraint] = dantic.Field(
+    description="Custom constraints that can be added to the generation to ensure that the output will contain the use of certain tokens as defined by ``Constraint`` objects, in the most sensible way possible."
+  )
+  forced_bos_token_id: int = dantic.Field(
+    description="The id of the token to force as the first generated token after the ``decoder_start_token_id``. Useful for multilingual models like [mBART](https://huggingface.co/docs/transformers/model_doc/mbart) where the first generated token needs to be the target language token. "
+  )
+  forced_eos_token_id: t.Union[int, t.List[int]] = dantic.Field(
+    description="The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens."
+  )
+  remove_invalid_values: bool = dantic.Field(
+    False,
+    description="Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash. Note that using `remove_invalid_values` can slow down generation."
+  )
+  exponential_decay_length_penalty: t.Tuple[int, float] = dantic.Field(
+    description="This tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty starts and `decay_factor` represents the factor of exponential decay"
+  )
+  suppress_tokens: t.List[int] = dantic.Field(
+    description="A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled."
+  )
+  begin_suppress_tokens: t.List[int] = dantic.Field(
+    description="A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit processor will set their log probs to `-inf` so that they are not sampled. "
+  )
+  forced_decoder_ids: t.List[t.List[int]] = dantic.Field(
+    description="A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123."
+  )
   num_return_sequences: int = dantic.Field(1, description="The number of independently computed returned sequences for each element in the batch.")
-  output_attentions: bool = dantic.Field(False, description="""Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.""")
+  output_attentions: bool = dantic.Field(
+    False, description="""Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details."""
+  )
   output_hidden_states: bool = dantic.Field(False, description="""Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.""")
   output_scores: bool = dantic.Field(False, description="""Whether or not to return the prediction scores. See `scores` under returned tensors for more details.""")
   pad_token_id: int = dantic.Field(description="The id of the *padding* token.")
@@ -234,7 +301,18 @@ class GenerationConfig(ReprMixin):
   @property
   def __repr_keys__(self) -> set[str]:
     return {i.name for i in attr.fields(self.__class__)}
-bentoml_cattr.register_unstructure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)}))
+bentoml_cattr.register_unstructure_hook_factory(
+  lambda cls: attr.has(cls) and lenient_issubclass(cls, GenerationConfig),
+  lambda cls: make_dict_unstructure_fn(
+    cls,
+    bentoml_cattr,
+    _cattrs_omit_if_default=False,
+    _cattrs_use_linecache=True,
+    **{
+      k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)
+    }
+  )
+)
 @attr.frozen(slots=True, repr=False, init=False)
 class SamplingParams(ReprMixin):
   """SamplingParams is the attr-compatible version of ``vllm.SamplingParams``. It provides some utilities to also respect shared variables from ``openllm.LLMConfig``.
@@ -246,9 +324,18 @@ class SamplingParams(ReprMixin):
   - max_tokens -> max_new_tokens
   """
   n: int = dantic.Field(1, description="Number of output sequences to return for the given prompt.")
-  best_of: int = dantic.Field(None, description="Number of output sequences that are generated from the prompt. From these `best_of` sequences, the top `n` sequences are returned. `best_of` must be greater than or equal to `n`. This is treated as the beam width when `use_beam_search` is True. By default, `best_of` is set to `n`.")
-  presence_penalty: float = dantic.Field(0.0, description="Float that penalizes new tokens based on whether they appear in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens.")
-  frequency_penalty: float = dantic.Field(0.0, description="Float that penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens.")
+  best_of: int = dantic.Field(
+    None,
+    description="Number of output sequences that are generated from the prompt. From these `best_of` sequences, the top `n` sequences are returned. `best_of` must be greater than or equal to `n`. This is treated as the beam width when `use_beam_search` is True. By default, `best_of` is set to `n`."
+  )
+  presence_penalty: float = dantic.Field(
+    0.0,
+    description="Float that penalizes new tokens based on whether they appear in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens."
+  )
+  frequency_penalty: float = dantic.Field(
+    0.0,
+    description="Float that penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens."
+  )
   use_beam_search: bool = dantic.Field(False, description="Whether to use beam search instead of sampling.")
   stop: t.List[str] = dantic.Field(None, description="List of strings that stop the generation when they are generated. The returned output will not contain the stop strings.")
   ignore_eos: bool = dantic.Field(False, description="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.")
@@ -264,7 +351,8 @@ class SamplingParams(ReprMixin):
     ...
   def __init__(self, *, _internal: bool = False, **attrs: t.Any):
-    if not _internal: raise RuntimeError("SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config or create one with 'SamplingParams.from_generation_config'")
+    if not _internal:
+      raise RuntimeError("SamplingParams is not meant to be used directly, but you can access this via a LLMConfig.sampling_config or create one with 'SamplingParams.from_generation_config'")
     _object_setattr(self, "max_tokens", attrs.pop("max_tokens", 16))
     _object_setattr(self, "temperature", attrs.pop("temperature", 1.0))
     _object_setattr(self, "top_k", attrs.pop("top_k", -1))
@@ -294,8 +382,22 @@ class SamplingParams(ReprMixin):
     top_p = first_not_none(attrs.pop("top_p", None), default=generation_config["top_p"])
     max_tokens = first_not_none(attrs.pop("max_tokens", None), attrs.pop("max_new_tokens", None), default=generation_config["max_new_tokens"])
     return cls(_internal=True, temperature=temperature, top_k=top_k, top_p=top_p, max_tokens=max_tokens, **attrs)
-bentoml_cattr.register_unstructure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)}))
-bentoml_cattr.register_structure_hook_factory(lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename="max_tokens")))
+bentoml_cattr.register_unstructure_hook_factory(
+  lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams),
+  lambda cls: make_dict_unstructure_fn(
+    cls,
+    bentoml_cattr,
+    _cattrs_omit_if_default=False,
+    _cattrs_use_linecache=True,
+    **{
+      k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING)
+    }
+  )
+)
+bentoml_cattr.register_structure_hook_factory(
+  lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams),
+  lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename="max_tokens"))
+)
 # cached it here to save one lookup per assignment
 _object_getattribute = object.__getattribute__
@@ -344,7 +446,25 @@ class ModelSettings(t.TypedDict, total=False):
   # tokenizer_class is the custom tokenizer class for this given LLM
   tokenizer_class: t.Optional[str]
 _transformed_type: DictStrAny = {"fine_tune_strategies": t.Dict[AdapterType, FineTuneConfig], "default_implementation": t.Dict[LiteralResourceSpec, LiteralRuntime]}
-@attr.define(frozen=False, slots=True, field_transformer=lambda _, __: [attr.Attribute.from_counting_attr(k, dantic.Field(kw_only=False if t.get_origin(ann) is not Required else True, auto_default=True, use_default_converter=False, type=_transformed_type.get(k, ann), metadata={"target": f"__openllm_{k}__"}, description=f"ModelSettings field for {k}.")) for k, ann in t.get_type_hints(ModelSettings).items()])
+@attr.define(
+  frozen=False,
+  slots=True,
+  field_transformer=lambda _,
+  __: [
+    attr.Attribute.from_counting_attr(
+      k,
+      dantic.Field(
+        kw_only=False if t.get_origin(ann) is not Required else True,
+        auto_default=True,
+        use_default_converter=False,
+        type=_transformed_type.get(k, ann),
+        metadata={"target": f"__openllm_{k}__"},
+        description=f"ModelSettings field for {k}."
+      )
+    ) for k,
+    ann in t.get_type_hints(ModelSettings).items()
+  ]
+)
 class _ModelSettingsAttr:
   """Internal attrs representation of ModelSettings."""
   def __getitem__(self, key: str) -> t.Any:
@@ -354,7 +474,30 @@ class _ModelSettingsAttr:
   @classmethod
   def default(cls) -> _ModelSettingsAttr:
-    return cls(**t.cast(DictStrAny, ModelSettings(default_id="__default__", model_ids=["__default__"], architecture="PreTrainedModel", default_implementation={"cpu": "pt", "nvidia.com/gpu": "pt"}, name_type="dasherize", requires_gpu=False, url="", model_type="causal_lm", trust_remote_code=False, requirements=None, tokenizer_class=None, timeout=int(36e6), service_name="", workers_per_resource=1., runtime="transformers")))
+    return cls(
+      **t.cast(
+        DictStrAny,
+        ModelSettings(
+          default_id="__default__",
+          model_ids=["__default__"],
+          architecture="PreTrainedModel",
+          default_implementation={
+            "cpu": "pt", "nvidia.com/gpu": "pt"
+          },
+          name_type="dasherize",
+          requires_gpu=False,
+          url="",
+          model_type="causal_lm",
+          trust_remote_code=False,
+          requirements=None,
+          tokenizer_class=None,
+          timeout=int(36e6),
+          service_name="",
+          workers_per_resource=1.,
+          runtime="transformers"
+        )
+      )
+    )
 # NOTE: The below are dynamically generated by the field_transformer
 if t.TYPE_CHECKING:
@@ -388,7 +531,8 @@ def get_default_implementation(default_implementation_mapping: dict[LiteralResou
   elif resource_spec("nvidia") in available_spec: return default_implementation_mapping.get(resource_spec("nvidia"), "pt")
   else: return default_implementation_mapping.get(resource_spec("cpu"), "pt")
 def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr:
-  if "generation_class" in cl_.__config__: raise ValueError(f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.")
+  if "generation_class" in cl_.__config__:
+    raise ValueError(f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.")
   required_fields = {k for k, ann in t.get_type_hints(ModelSettings).items() if t.get_origin(ann) is Required}
   if any(i not in cl_.__config__ for i in required_fields): raise ValueError(f"Missing required fields {required_fields} '__config__'.")
@@ -703,7 +847,9 @@ class _ConfigBuilder:
     return cls
   def add_attrs_init(self) -> Self:
-    self._cls_dict["__attrs_init__"] = codegen.add_method_dunders(self._cls, _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True))
+    self._cls_dict["__attrs_init__"] = codegen.add_method_dunders(
+      self._cls, _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True)
+    )
     return self
   def add_repr(self) -> Self:
@@ -821,7 +967,24 @@ class LLMConfig(_ConfigAttr):
     def _make_subclass(class_attr: str, base: type[At], globs: dict[str, t.Any] | None = None, suffix_env: LiteralString | None = None) -> type[At]:
       camel_name = cls.__name__.replace("Config", "")
-      klass = attr.make_class(f"{camel_name}{class_attr}", [], bases=(base,), slots=True, weakref_slot=True, frozen=True, repr=False, init=False, collect_by_mro=True, field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__, suffix=suffix_env, globs=globs, default_callback=lambda field_name, field_default: getattr(getattr(cls, class_attr), field_name, field_default) if codegen.has_own_attribute(cls, class_attr) else field_default))
+      klass = attr.make_class(
+        f"{camel_name}{class_attr}", [],
+        bases=(base,),
+        slots=True,
+        weakref_slot=True,
+        frozen=True,
+        repr=False,
+        init=False,
+        collect_by_mro=True,
+        field_transformer=codegen.make_env_transformer(
+          cls,
+          cls.__openllm_model_name__,
+          suffix=suffix_env,
+          globs=globs,
+          default_callback=lambda field_name,
+          field_default: getattr(getattr(cls, class_attr), field_name, field_default) if codegen.has_own_attribute(cls, class_attr) else field_default
+        )
+      )
       # For pickling to work, the __module__ variable needs to be set to the
       # frame where the class is created. This respect the module that is created from cls
       try:
@@ -876,7 +1039,10 @@ class LLMConfig(_ConfigAttr):
     pass
   def __setattr__(self, attr: str, value: t.Any) -> None:
-    if attr in _reserved_namespace: raise ForbiddenAttributeError(f"{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.")
+    if attr in _reserved_namespace:
+      raise ForbiddenAttributeError(
+        f"{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}."
+      )
     super().__setattr__(attr, value)
   def __init__(self, *, generation_config: DictStrAny | None = None, __openllm_extras__: DictStrAny | None = None, **attrs: t.Any):
@@ -1093,10 +1259,13 @@ class LLMConfig(_ConfigAttr):
     return list(self.__openllm_accepted_keys__) + list(self.__openllm_extras__)
   def values(self) -> list[t.Any]:
-    return ([getattr(self, k.name) for k in attr.fields(self.__class__)] + [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] + [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.values()))
+    return ([getattr(self, k.name) for k in attr.fields(self.__class__)] + [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] + [
+      getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)
+    ] + list(self.__openllm_extras__.values()))
   def items(self) -> list[tuple[str, t.Any]]:
-    return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] + [(k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)] + [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.items()))
+    return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] + [(k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)] +
+            [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.items()))
   def __iter__(self) -> t.Iterator[str]:
     return iter(self.keys())
@@ -1127,9 +1296,12 @@ class LLMConfig(_ConfigAttr):
       raise ValueError("Cannot derivate a LLMConfig without __config__")
     _new_cfg = {k: v for k, v in attrs.items() if k in attr.fields_dict(_ModelSettingsAttr)}
     attrs = {k: v for k, v in attrs.items() if k not in _new_cfg}
-    new_cls = types.new_class(name or f"{cls.__name__.replace('Config', '')}DerivateConfig", (cls,), {}, lambda ns: ns.update({
-        "__config__": config_merger.merge(copy.deepcopy(cls.__dict__["__config__"]), _new_cfg), "__base_config__": cls, # keep a reference for easy access
-    }))
+    new_cls = types.new_class(
+      name or f"{cls.__name__.replace('Config', '')}DerivateConfig", (cls,), {},
+      lambda ns: ns.update({
+        "__config__": config_merger.merge(copy.deepcopy(cls.__dict__["__config__"]), _new_cfg), "__base_config__": cls, # keep a reference for easy access
+      })
+    )
     # For pickling to work, the __module__ variable needs to be set to the
     # frame where the class is created. Bypass this step in environments where
@@ -1286,7 +1458,9 @@ class LLMConfig(_ConfigAttr):
     `openllm.LLM` also has a postprocess_generate that will just call this method.
     """
     return generation_result
-bentoml_cattr.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls, LLMConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True))
+bentoml_cattr.register_unstructure_hook_factory(
+  lambda cls: lenient_issubclass(cls, LLMConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True)
+)
 def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
   """Structure a dictionary to a LLMConfig object.
diff --git a/openllm-core/src/openllm_core/_prompt.py b/openllm-core/src/openllm_core/_prompt.py
index b8ee17f3..4f8fc66a 100644
--- a/openllm-core/src/openllm_core/_prompt.py
+++ b/openllm-core/src/openllm_core/_prompt.py
@@ -23,4 +23,6 @@ def process_prompt(prompt: str, template: str | None = None, use_prompt_template
   try:
     return template.format(instruction=prompt, **prompt_variables)
   except KeyError as e:
-    raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_prompt_template=False' to disable the default prompt template.") from None
+    raise RuntimeError(
+      f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. Use 'use_prompt_template=False' to disable the default prompt template."
+    ) from None
diff --git a/openllm-core/src/openllm_core/_schema.py b/openllm-core/src/openllm_core/_schema.py
index 893150b7..93ab3702 100644
--- a/openllm-core/src/openllm_core/_schema.py
+++ b/openllm-core/src/openllm_core/_schema.py
@@ -28,7 +28,14 @@ class GenerationInput:
   @classmethod
   def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]:
-    return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)})
+    return attr.make_class(
+      inflection.camelize(llm_config["model_name"]) + "GenerationInput",
+      attrs={
+        "prompt": attr.field(type=str),
+        "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)),
+        "adapter_name": attr.field(default=None, type=str)
+      }
+    )
 @attr.frozen(slots=True)
 class GenerationOutput:
   responses: t.List[t.Any]
@@ -60,7 +67,16 @@ class EmbeddingsOutput:
   embeddings: t.List[t.List[float]]
   num_tokens: int
 def unmarshal_vllm_outputs(request_output: vllm.RequestOutput) -> dict[str, t.Any]:
-  return dict(request_id=request_output.request_id, prompt=request_output.prompt, finished=request_output.finished, prompt_token_ids=request_output.prompt_token_ids, outputs=[dict(index=it.index, text=it.text, token_ids=it.token_ids, cumulative_logprob=it.cumulative_logprob, logprobs=it.logprobs, finish_reason=it.finish_reason) for it in request_output.outputs])
+  return dict(
+    request_id=request_output.request_id,
+    prompt=request_output.prompt,
+    finished=request_output.finished,
+    prompt_token_ids=request_output.prompt_token_ids,
+    outputs=[
+      dict(index=it.index, text=it.text, token_ids=it.token_ids, cumulative_logprob=it.cumulative_logprob, logprobs=it.logprobs, finish_reason=it.finish_reason)
+      for it in request_output.outputs
+    ]
+  )
 @attr.define
 class HfAgentInput:
   inputs: str
diff --git a/openllm-core/src/openllm_core/_strategies.py b/openllm-core/src/openllm_core/_strategies.py
index 78449934..28a6f49c 100644
--- a/openllm-core/src/openllm_core/_strategies.py
+++ b/openllm-core/src/openllm_core/_strategies.py
@@ -177,21 +177,40 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
   except (ImportError, RuntimeError):
     pass
 def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
-  return types.new_class(name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind}, lambda ns: ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"}))
+  return types.new_class(
+    name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind},
+    lambda ns: ns.update({
+      "resource_id": resource_kind,
+      "from_spec": classmethod(_from_spec),
+      "from_system": classmethod(_from_system),
+      "validate": classmethod(_validate),
+      "__repr_keys__": property(lambda _: {"resource_id"}),
+      "__doc__": inspect.cleandoc(docstring),
+      "__module__": "openllm._strategies"
+    })
+  )
 # NOTE: we need to hint these t.Literal since mypy is to dumb to infer this as literal :facepalm:
 _TPU_RESOURCE: t.Literal["cloud-tpus.google.com/v2"] = "cloud-tpus.google.com/v2"
 _AMD_GPU_RESOURCE: t.Literal["amd.com/gpu"] = "amd.com/gpu"
 _NVIDIA_GPU_RESOURCE: t.Literal["nvidia.com/gpu"] = "nvidia.com/gpu"
 _CPU_RESOURCE: t.Literal["cpu"] = "cpu"
-NvidiaGpuResource = _make_resource_class("NvidiaGpuResource", _NVIDIA_GPU_RESOURCE, """NVIDIA GPU resource.
+NvidiaGpuResource = _make_resource_class(
+  "NvidiaGpuResource",
+  _NVIDIA_GPU_RESOURCE,
+  """NVIDIA GPU resource.
   This is a modified version of internal's BentoML's NvidiaGpuResource
-  where it respects and parse CUDA_VISIBLE_DEVICES correctly.""")
-AmdGpuResource = _make_resource_class("AmdGpuResource", _AMD_GPU_RESOURCE, """AMD GPU resource.
+  where it respects and parse CUDA_VISIBLE_DEVICES correctly."""
+)
+AmdGpuResource = _make_resource_class(
+  "AmdGpuResource",
+  _AMD_GPU_RESOURCE,
+  """AMD GPU resource.
   Since ROCm will respect CUDA_VISIBLE_DEVICES, the behaviour of from_spec, from_system are similar to
-  ``NvidiaGpuResource``. Currently ``validate`` is not yet supported.""")
+  ``NvidiaGpuResource``. Currently ``validate`` is not yet supported."""
+)
 LiteralResourceSpec = t.Literal["cloud-tpus.google.com/v2", "amd.com/gpu", "nvidia.com/gpu", "cpu"]
 # convenient mapping
diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py
index c845202d..090e74cd 100644
--- a/openllm-core/src/openllm_core/_typing_compat.py
+++ b/openllm-core/src/openllm_core/_typing_compat.py
@@ -10,7 +10,10 @@ if t.TYPE_CHECKING:
   from bentoml._internal.runner.strategy import Strategy
   from .utils.lazy import VersionInfo
-M = t.TypeVar("M", bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, vllm.AsyncLLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]")
+M = t.TypeVar(
+  "M",
+  bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, vllm.AsyncLLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]"
+)
 T = t.TypeVar("T", bound="t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]")
 AnyCallable = t.Callable[..., t.Any]
@@ -79,7 +82,19 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
   generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
   generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
-  def __init__(self, runnable_class: type[LLMRunnable[M, T]], *, runnable_init_params: dict[str, t.Any] | None = ..., name: str | None = ..., scheduling_strategy: type[Strategy] = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, dict[str, int]] | None = ..., embedded: bool = False,) -> None:
+  def __init__(
+    self,
+    runnable_class: type[LLMRunnable[M, T]],
+    *,
+    runnable_init_params: dict[str, t.Any] | None = ...,
+    name: str | None = ...,
+    scheduling_strategy: type[Strategy] = ...,
+    models: list[bentoml.Model] | None = ...,
+    max_batch_size: int | None = ...,
+    max_latency_ms: int | None = ...,
+    method_configs: dict[str, dict[str, int]] | None = ...,
+    embedded: bool = False,
+  ) -> None:
     ...
   def __call__(self, prompt: str, **attrs: t.Any) -> t.Any:
diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py
index 0c878b69..826e18bb 100644
--- a/openllm-core/src/openllm_core/config/configuration_auto.py
+++ b/openllm-core/src/openllm_core/config/configuration_auto.py
@@ -13,7 +13,9 @@ if t.TYPE_CHECKING:
   ConfigItemsView = _odict_items[str, type[openllm_core.LLMConfig]]
 # NOTE: This is the entrypoint when adding new model config
-CONFIG_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLMConfig"), ("dolly_v2", "DollyV2Config"), ("falcon", "FalconConfig"), ("flan_t5", "FlanT5Config"), ("gpt_neox", "GPTNeoXConfig"), ("llama", "LlamaConfig"), ("mpt", "MPTConfig"), ("opt", "OPTConfig"), ("stablelm", "StableLMConfig"), ("starcoder", "StarCoderConfig"), ("baichuan", "BaichuanConfig")])
+CONFIG_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLMConfig"), ("dolly_v2", "DollyV2Config"), ("falcon", "FalconConfig"), ("flan_t5", "FlanT5Config"), ("gpt_neox", "GPTNeoXConfig"), (
+  "llama", "LlamaConfig"
+), ("mpt", "MPTConfig"), ("opt", "OPTConfig"), ("stablelm", "StableLMConfig"), ("starcoder", "StarCoderConfig"), ("baichuan", "BaichuanConfig")])
 class _LazyConfigMapping(OrderedDict, ReprMixin):
   def __init__(self, mapping: OrderedDict[LiteralString, LiteralString]):
     self._mapping = mapping
diff --git a/openllm-core/src/openllm_core/config/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py
index 68a85482..c3594dcd 100644
--- a/openllm-core/src/openllm_core/config/configuration_baichuan.py
+++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py
@@ -31,14 +31,33 @@ class BaichuanConfig(openllm_core.LLMConfig):
   and English benchmarks (C-Eval, MMLU, etc).
   Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
   """
-  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/baichuan-inc/Baichuan-7B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "BaiChuanForCausalLM", "default_id": "baichuan-inc/baichuan-7b", "model_ids": ["baichuan-inc/baichuan-7b", "baichuan-inc/baichuan-13b-base", "baichuan-inc/baichuan-13b-chat", "fireballoon/baichuan-vicuna-chinese-7b", "fireballoon/baichuan-vicuna-7b", "hiyouga/baichuan-7b-sft"]}
+  __config__ = {
+    "name_type": "lowercase",
+    "trust_remote_code": True,
+    "timeout": 3600000,
+    "requires_gpu": True,
+    "url": "https://github.com/baichuan-inc/Baichuan-7B",
+    "requirements": ["cpm-kernels", "sentencepiece"],
+    "architecture": "BaiChuanForCausalLM",
+    "default_id": "baichuan-inc/baichuan-7b",
+    "model_ids": [
+      "baichuan-inc/baichuan-7b",
+      "baichuan-inc/baichuan-13b-base",
+      "baichuan-inc/baichuan-13b-chat",
+      "fireballoon/baichuan-vicuna-chinese-7b",
+      "fireballoon/baichuan-vicuna-7b",
+      "hiyouga/baichuan-7b-sft"
+    ]
+  }
   class GenerationConfig:
     max_new_tokens: int = 2048
     top_p: float = 0.7
     temperature: float = 0.95
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+  def sanitize_parameters(
+    self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any
+  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
     return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {}
   def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
diff --git a/openllm-core/src/openllm_core/config/configuration_chatglm.py b/openllm-core/src/openllm_core/config/configuration_chatglm.py
index d70d9aba..504709e1 100644
--- a/openllm-core/src/openllm_core/config/configuration_chatglm.py
+++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py
@@ -35,7 +35,17 @@ class ChatGLMConfig(openllm_core.LLMConfig):
   Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
   """
-  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration", "default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]}
+  __config__ = {
+    "name_type": "lowercase",
+    "trust_remote_code": True,
+    "timeout": 3600000,
+    "requires_gpu": True,
+    "url": "https://github.com/THUDM/ChatGLM-6B",
+    "requirements": ["cpm-kernels", "sentencepiece"],
+    "architecture": "ChatGLMForConditionalGeneration",
+    "default_id": "thudm/chatglm-6b",
+    "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]
+  }
   retain_history: bool = dantic.Field(False, description="Whether to retain history given to the model. If set to True, then the model will retain given history.")
   use_half_precision: bool = dantic.Field(True, description="Whether to use half precision for model.")
@@ -45,7 +55,17 @@ class ChatGLMConfig(openllm_core.LLMConfig):
     top_p: float = 0.7
     temperature: float = 0.95
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[tuple[str, str]] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+  def sanitize_parameters(
+    self,
+    prompt: str,
+    max_new_tokens: int | None = None,
+    num_beams: int | None = None,
+    top_p: float | None = None,
+    temperature: float | None = None,
+    chat_history: list[tuple[str, str]] | None = None,
+    use_default_prompt_template: bool = False,
+    **attrs: t.Any
+  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
     prompt_text = ""
     if use_default_prompt_template and chat_history is not None:
       for i, (old_query, response) in enumerate(chat_history):
diff --git a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
index d7a47b99..e75b8d64 100644
--- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
+++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py
@@ -66,7 +66,13 @@ class DollyV2Config(openllm_core.LLMConfig):
   Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
   """
-  __config__ = {"timeout": 3600000, "url": "https://github.com/databrickslabs/dolly", "architecture": "GPTNeoXForCausalLM", "default_id": "databricks/dolly-v2-3b", "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]}
+  __config__ = {
+    "timeout": 3600000,
+    "url": "https://github.com/databrickslabs/dolly",
+    "architecture": "GPTNeoXForCausalLM",
+    "default_id": "databricks/dolly-v2-3b",
+    "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]
+  }
   return_full_text: bool = dantic.Field(False, description="Whether to return the full prompt to the users.")
   class GenerationConfig:
@@ -76,8 +82,19 @@ class DollyV2Config(openllm_core.LLMConfig):
     max_new_tokens: int = 256
    eos_token_id: int = 50277 # NOTE: from get_special_token_id(self.tokenizer, END_KEY)
-  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
-    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {}
+  def sanitize_parameters(
+    self,
+    prompt: str,
+    max_new_tokens: int | None = None,
+    temperature: float | None = None,
+    top_k: int | None = None,
+    top_p: float | None = None,
+    use_default_prompt_template: bool = True,
+    **attrs: t.Any
+  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
+    return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {
+      "max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs
+    }, {}
   def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str:
     return
generation_result[0]["generated_text"] diff --git a/openllm-core/src/openllm_core/config/configuration_falcon.py b/openllm-core/src/openllm_core/config/configuration_falcon.py index e7e1869c..0a8a8842 100644 --- a/openllm-core/src/openllm_core/config/configuration_falcon.py +++ b/openllm-core/src/openllm_core/config/configuration_falcon.py @@ -55,8 +55,19 @@ class FalconConfig(openllm_core.LLMConfig): num_beams: int = 4 early_stopping: bool = True - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {} + def sanitize_parameters( + self, + prompt: str, + max_new_tokens: int | None = None, + top_k: int | None = None, + num_return_sequences: int | None = None, + eos_token_id: int | None = None, + use_default_prompt_template: bool = False, + **attrs: t.Any + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { + "max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs + }, {} def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py index dbfd8fdc..e0a328a9 100644 --- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py +++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py @@ -34,7 +34,13 @@ class FlanT5Config(openllm_core.LLMConfig): Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information. 
""" - __config__ = {"url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", "architecture": "T5ForConditionalGeneration", "model_type": "seq2seq_lm", "default_id": "google/flan-t5-large", "model_ids": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl",]} + __config__ = { + "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5", + "architecture": "T5ForConditionalGeneration", + "model_type": "seq2seq_lm", + "default_id": "google/flan-t5-large", + "model_ids": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl",] + } class GenerationConfig: temperature: float = 0.9 @@ -43,8 +49,20 @@ class FlanT5Config(openllm_core.LLMConfig): top_p: float = 0.4 repetition_penalty = 1.0 - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty}, {} + def sanitize_parameters( + self, + prompt: str, + max_new_tokens: int | None = None, + temperature: float | None = None, + top_k: int | None = None, + top_p: float | None = None, + repetition_penalty: float | None = None, + use_default_prompt_template: bool = True, + **attrs: t.Any + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { + "max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty + }, {} def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py index 537ecbc8..0913ddcf 100644 --- a/openllm-core/src/openllm_core/config/configuration_gpt_neox.py +++ b/openllm-core/src/openllm_core/config/configuration_gpt_neox.py @@ -37,14 +37,23 @@ class GPTNeoXConfig(openllm_core.LLMConfig): Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox) for more information. 
""" - __config__ = {"model_name": "gpt_neox", "start_name": "gpt-neox", "requires_gpu": True, "architecture": "GPTNeoXForCausalLM", "url": "https://github.com/EleutherAI/gpt-neox", "default_id": "eleutherai/gpt-neox-20b", "model_ids": ["eleutherai/gpt-neox-20b"]} + __config__ = { + "model_name": "gpt_neox", + "start_name": "gpt-neox", + "requires_gpu": True, + "architecture": "GPTNeoXForCausalLM", + "url": "https://github.com/EleutherAI/gpt-neox", + "default_id": "eleutherai/gpt-neox-20b", + "model_ids": ["eleutherai/gpt-neox-20b"] + } use_half_precision: bool = dantic.Field(True, description="Whether to use half precision for model.") class GenerationConfig: temperature: float = 0.9 max_new_tokens: int = 100 - def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = True, + **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature}, {} def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py index b0ebc164..8982f113 100644 --- a/openllm-core/src/openllm_core/config/configuration_llama.py +++ b/openllm-core/src/openllm_core/config/configuration_llama.py @@ -102,8 +102,20 @@ class LlamaConfig(openllm_core.LLMConfig): best_of: int = 1 presence_penalty: float = 0.5 - def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {} + def sanitize_parameters( + self, + prompt: str, + top_k: int | None = None, + top_p: float | None = None, + temperature: float | None = None, + max_new_tokens: int | None = None, + use_default_prompt_template: bool = False, + use_llama2_prompt: bool = True, + **attrs: t.Any + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), { + "max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k + }, {} def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_mpt.py b/openllm-core/src/openllm_core/config/configuration_mpt.py index 3473729d..34713421 100644 --- a/openllm-core/src/openllm_core/config/configuration_mpt.py +++ b/openllm-core/src/openllm_core/config/configuration_mpt.py @@ -53,16 +53,39 @@ class MPTConfig(openllm_core.LLMConfig): on HuggingFace. 
Refer to [HuggingFace's MosaicML page](https://huggingface.co/mosaicml) for more details on specific models. """ - __config__ = {"name_type": "lowercase", "trust_remote_code": True, "url": "https://huggingface.co/mosaicml", "timeout": int(36e6), "requirements": ["triton", "einops"], "architecture": "MPTForCausalLM", "default_id": "mosaicml/mpt-7b-instruct", "model_ids": ["mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter", "mosaicml/mpt-30b", "mosaicml/mpt-30b-instruct", "mosaicml/mpt-30b-chat"]} + __config__ = { + "name_type": "lowercase", + "trust_remote_code": True, + "url": "https://huggingface.co/mosaicml", + "timeout": int(36e6), + "requirements": ["triton", "einops"], + "architecture": "MPTForCausalLM", + "default_id": "mosaicml/mpt-7b-instruct", + "model_ids": [ + "mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter", "mosaicml/mpt-30b", "mosaicml/mpt-30b-instruct", "mosaicml/mpt-30b-chat" + ] + } prompt_type: MPTPromptType = dantic.Field('"default"', description="Given prompt type for running MPT. Default will be inferred from model name if pretrained.") - max_sequence_length: int = dantic.Field(2048, description="Max sequence length to run MPT with. Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)") + max_sequence_length: int = dantic.Field( + 2048, + description="Max sequence length to run MPT with. Note that MPT is trained with a sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can be set up to 4096 (for 7b models) and 16384 (for 30b models)" + ) class GenerationConfig: max_new_tokens: int = 128 temperature: float = 0 top_p: float = 0.8 - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_p: float | None = None, prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any,) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + def sanitize_parameters( + self, + prompt: str, + max_new_tokens: int | None = None, + temperature: float | None = None, + top_p: float | None = None, + prompt_type: MPTPromptType | None = None, + use_default_prompt_template: bool = True, + **attrs: t.Any, + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: _template = None if use_default_prompt_template: if prompt_type is None: diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py index 1731c944..68163419 100644 --- a/openllm-core/src/openllm_core/config/configuration_opt.py +++ b/openllm-core/src/openllm_core/config/configuration_opt.py @@ -38,7 +38,17 @@ class OPTConfig(openllm_core.LLMConfig): Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
""" - __config__ = {"name_type": "lowercase", "trust_remote_code": False, "url": "https://huggingface.co/docs/transformers/model_doc/opt", "default_id": "facebook/opt-1.3b", "architecture": "OPTForCausalLM", "model_ids": ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-66b"], "fine_tune_strategies": ({"adapter_type": "lora", "r": 16, "lora_alpha": 32, "target_modules": ["q_proj", "v_proj"], "lora_dropout": 0.05, "bias": "none"},)} + __config__ = { + "name_type": "lowercase", + "trust_remote_code": False, + "url": "https://huggingface.co/docs/transformers/model_doc/opt", + "default_id": "facebook/opt-1.3b", + "architecture": "OPTForCausalLM", + "model_ids": ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-66b"], + "fine_tune_strategies": ({ + "adapter_type": "lora", "r": 16, "lora_alpha": 32, "target_modules": ["q_proj", "v_proj"], "lora_dropout": 0.05, "bias": "none" + },) + } format_outputs: bool = dantic.Field(False, description="""Whether to format the outputs. This can be used when num_return_sequences > 1.""") class GenerationConfig: @@ -47,8 +57,19 @@ class OPTConfig(openllm_core.LLMConfig): max_new_tokens: int = 1024 num_return_sequences: int = 1 - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {} + def sanitize_parameters( + self, + prompt: str, + max_new_tokens: int | None = None, + temperature: float | None = None, + top_k: int | None = None, + num_return_sequences: int | None = None, + use_default_prompt_template: bool = False, + **attrs: t.Any + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { + "max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences + }, {} def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str: if len(generation_result) == 1: return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_stablelm.py b/openllm-core/src/openllm_core/config/configuration_stablelm.py index 13478a4d..e5dec95b 100644 --- a/openllm-core/src/openllm_core/config/configuration_stablelm.py +++ b/openllm-core/src/openllm_core/config/configuration_stablelm.py @@ -41,7 +41,13 @@ class StableLMConfig(openllm_core.LLMConfig): and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b) for more information. 
""" - __config__ = {"name_type": "lowercase", "url": "https://github.com/Stability-AI/StableLM", "architecture": "GPTNeoXForCausalLM", "default_id": "stabilityai/stablelm-tuned-alpha-3b", "model_ids": ["stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", "stabilityai/stablelm-base-alpha-7b"]} + __config__ = { + "name_type": "lowercase", + "url": "https://github.com/Stability-AI/StableLM", + "architecture": "GPTNeoXForCausalLM", + "default_id": "stabilityai/stablelm-tuned-alpha-3b", + "model_ids": ["stabilityai/stablelm-tuned-alpha-3b", "stabilityai/stablelm-tuned-alpha-7b", "stabilityai/stablelm-base-alpha-3b", "stabilityai/stablelm-base-alpha-7b"] + } class GenerationConfig: temperature: float = 0.9 @@ -49,7 +55,16 @@ class StableLMConfig(openllm_core.LLMConfig): top_k: int = 0 top_p: float = 0.9 - def sanitize_parameters(self, prompt: str, temperature: float | None = None, max_new_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + def sanitize_parameters( + self, + prompt: str, + temperature: float | None = None, + max_new_tokens: int | None = None, + top_k: int | None = None, + top_p: float | None = None, + use_default_prompt_template: bool = False, + **attrs: t.Any + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: if "tuned" in self._model_id and use_default_prompt_template: system_prompt = attrs.pop("system_prompt", SYSTEM_PROMPT) prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs) diff --git a/openllm-core/src/openllm_core/config/configuration_starcoder.py b/openllm-core/src/openllm_core/config/configuration_starcoder.py index 86c19580..7aafb1b5 100644 --- a/openllm-core/src/openllm_core/config/configuration_starcoder.py +++ b/openllm-core/src/openllm_core/config/configuration_starcoder.py @@ -30,7 +30,16 @@ class StarCoderConfig(openllm_core.LLMConfig): Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information. 
""" - __config__ = {"name_type": "lowercase", "requires_gpu": True, "url": "https://github.com/bigcode-project/starcoder", "architecture": "GPTBigCodeForCausalLM", "requirements": ["bitsandbytes"], "workers_per_resource": 0.5, "default_id": "bigcode/starcoder", "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"]} + __config__ = { + "name_type": "lowercase", + "requires_gpu": True, + "url": "https://github.com/bigcode-project/starcoder", + "architecture": "GPTBigCodeForCausalLM", + "requirements": ["bitsandbytes"], + "workers_per_resource": 0.5, + "default_id": "bigcode/starcoder", + "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"] + } class GenerationConfig: temperature: float = 0.2 @@ -41,7 +50,9 @@ class StarCoderConfig(openllm_core.LLMConfig): pad_token_id: int = 49152 repetition_penalty: float = 1.2 - def sanitize_parameters(self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + def sanitize_parameters( + self, prompt: str, temperature: float | None = None, top_p: float | None = None, max_new_tokens: int | None = None, repetition_penalty: float | None = None, **attrs: t.Any + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: fim_mode, prefix, suffix = FIM_INDICATOR in prompt, None, None if fim_mode: try: diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index 0482e834..a503350f 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -6,10 +6,25 @@ from __future__ import annotations import contextlib, functools, hashlib, logging, logging.config, os, sys, types, typing as t, openllm_core, asyncio from pathlib import Path from circus.exc import ConflictError -from bentoml._internal.configuration import (DEBUG_ENV_VAR as DEBUG_ENV_VAR, GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR, QUIET_ENV_VAR as QUIET_ENV_VAR, get_debug_mode as _get_debug_mode, get_quiet_mode as _get_quiet_mode, set_quiet_mode as set_quiet_mode,) +from bentoml._internal.configuration import ( + DEBUG_ENV_VAR as DEBUG_ENV_VAR, + GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR, + QUIET_ENV_VAR as QUIET_ENV_VAR, + get_debug_mode as _get_debug_mode, + get_quiet_mode as _get_quiet_mode, + set_quiet_mode as set_quiet_mode, +) from bentoml._internal.models.model import ModelContext as _ModelContext from bentoml._internal.types import LazyType as LazyType -from bentoml._internal.utils import (LazyLoader as LazyLoader, bentoml_cattr as bentoml_cattr, calc_dir_size as calc_dir_size, first_not_none as first_not_none, pkg as pkg, reserve_free_port as reserve_free_port, resolve_user_filepath as resolve_user_filepath,) +from bentoml._internal.utils import ( + LazyLoader as LazyLoader, + bentoml_cattr as bentoml_cattr, + calc_dir_size as calc_dir_size, + first_not_none as first_not_none, + pkg as pkg, + reserve_free_port as reserve_free_port, + resolve_user_filepath as resolve_user_filepath, +) from openllm_core.utils.lazy import (LazyModule as LazyModule, VersionInfo as VersionInfo,) if t.TYPE_CHECKING: @@ -105,14 +120,16 @@ _LOGGING_CONFIG: dict[str, t.Any] = { "handlers": { "bentomlhandler": { "class": "logging.StreamHandler", "filters": ["excfilter", "infofilter"], "stream": "ext://sys.stdout" - }, "defaulthandler": { + }, + "defaulthandler": { "class": "logging.StreamHandler", "level": logging.WARNING } }, "loggers": { "bentoml": 
{ "handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False - }, "openllm": { + }, + "openllm": { "handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False } }, diff --git a/openllm-core/src/openllm_core/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py index ae31dd1f..fd89550a 100644 --- a/openllm-core/src/openllm_core/utils/codegen.py +++ b/openllm-core/src/openllm_core/utils/codegen.py @@ -82,14 +82,18 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t. return globs[attr_class_name] def generate_unique_filename(cls: type[t.Any], func_name: str) -> str: return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>" -def generate_function(typ: type[t.Any], func_name: str, lines: list[str] | None, args: tuple[str, ...] | None, globs: dict[str, t.Any], annotations: dict[str, t.Any] | None = None) -> AnyCallable: +def generate_function( + typ: type[t.Any], func_name: str, lines: list[str] | None, args: tuple[str, ...] | None, globs: dict[str, t.Any], annotations: dict[str, t.Any] | None = None +) -> AnyCallable: from openllm_core.utils import SHOW_CODEGEN script = "def %s(%s):\n %s\n" % (func_name, ", ".join(args) if args is not None else "", "\n ".join(lines) if lines else "pass") meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs) if annotations: meth.__annotations__ = annotations if SHOW_CODEGEN: logger.info("Generated script for %s:\n\n%s", typ, script) return meth -def make_env_transformer(cls: type[openllm_core.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None,) -> AnyCallable: +def make_env_transformer( + cls: type[openllm_core.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None, +) -> AnyCallable: from openllm_core.utils import dantic, field_env_key def identity(_: str, x_value: t.Any) -> t.Any: @@ -98,7 +102,19 @@ def make_env_transformer(cls: type[openllm_core.LLMConfig], model_name: str, suf default_callback = identity if default_callback is None else default_callback globs = {} if globs is None else globs globs.update({"__populate_env": dantic.env_converter, "__default_callback": default_callback, "__field_env": field_env_key, "__suffix": suffix or "", "__model_name": model_name,}) - lines: ListStr = ["__env = lambda field_name: __field_env(__model_name, field_name, __suffix)", "return [", " f.evolve(", " default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),", " metadata={", " 'env': f.metadata.get('env', __env(f.name)),", " 'description': f.metadata.get('description', '(not provided)'),", " },", " )", " for f in fields", "]"] + lines: ListStr = [ + "__env = lambda field_name: __field_env(__model_name, field_name, __suffix)", + "return [", + " f.evolve(", + " default=__populate_env(__default_callback(f.name, f.default), __env(f.name)),", + " metadata={", + " 'env': f.metadata.get('env', __env(f.name)),", + " 'description': f.metadata.get('description', '(not provided)'),", + " },", + " )", + " for f in fields", + "]" + ] fields_ann = "list[attr.Attribute[t.Any]]" return generate_function(cls, "__auto_env", lines, args=("_", "fields"), globs=globs, annotations={"_": "type[LLMConfig]", "fields": fields_ann, "return": fields_ann}) def 
gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: @@ -115,5 +131,20 @@ def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: if func.__doc__ is None: doc = f"Generated SDK for {func.__name__}" else: doc = func.__doc__ - return t.cast(_T, functools.update_wrapper(types.new_class(name, (t.cast("PartialAny", functools.partial), ReprMixin), exec_body=lambda ns: ns.update({"__repr_keys__": property(lambda _: [i for i in _signatures.keys() if not i.startswith("_")]), "__repr_args__": _repr_args, "__repr__": _repr, "__doc__": inspect.cleandoc(doc), "__module__": "openllm"}),)(func, **attrs), func,)) + return t.cast( + _T, + functools.update_wrapper( + types.new_class( + name, (t.cast("PartialAny", functools.partial), ReprMixin), + exec_body=lambda ns: ns.update({ + "__repr_keys__": property(lambda _: [i for i in _signatures.keys() if not i.startswith("_")]), + "__repr_args__": _repr_args, + "__repr__": _repr, + "__doc__": inspect.cleandoc(doc), + "__module__": "openllm" + }), + )(func, **attrs), + func, + ) + ) __all__ = ["gen_sdk", "make_attr_tuple_class", "make_env_transformer", "generate_unique_filename", "generate_function"] diff --git a/openllm-core/src/openllm_core/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py index 87e1a3d4..8dcdf25f 100644 --- a/openllm-core/src/openllm_core/utils/dantic.py +++ b/openllm-core/src/openllm_core/utils/dantic.py @@ -3,17 +3,33 @@ from __future__ import annotations import functools, importlib, os, sys, typing as t from enum import Enum import attr, click, click_option_group as cog, inflection, orjson -from click import (ParamType, shell_completion as sc, types as click_types,) - +from click import ParamType, shell_completion as sc, types as click_types if t.TYPE_CHECKING: from attr import _ValidatorType - AnyCallable = t.Callable[..., t.Any] FC = t.TypeVar("FC", bound=t.Union[AnyCallable, click.Command]) -__all__ = ["FC", "attrs_to_options", "Field", "parse_type", "is_typing", "is_literal", "ModuleType", "EnumChoice", "LiteralChoice", "allows_multiple", "is_mapping", "is_container", "parse_container_args", "parse_single_arg", "CUDA", "JsonType", "BytesType"] +__all__ = [ + "FC", + "attrs_to_options", + "Field", + "parse_type", + "is_typing", + "is_literal", + "ModuleType", + "EnumChoice", + "LiteralChoice", + "allows_multiple", + "is_mapping", + "is_container", + "parse_container_args", + "parse_single_arg", + "CUDA", + "JsonType", + "BytesType" +] def __dir__() -> list[str]: return sorted(__all__) -def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, typ: t.Any | None = None, suffix_generation: bool = False, suffix_sampling: bool = False,) -> t.Callable[[FC], FC]: +def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, typ: t.Any = None, suffix_generation: bool = False, suffix_sampling: bool = False) -> t.Callable[[FC], FC]: # TODO: support parsing nested attrs class and Union envvar = field.metadata["env"] dasherized = inflection.dasherize(name) @@ -29,7 +45,18 @@ def attrs_to_options(name: str, field: attr.Attribute[t.Any], model_name: str, t elif suffix_sampling: identifier = f"{model_name}_sampling_{underscored}" else: identifier = f"{model_name}_{underscored}" - return cog.optgroup.option(identifier, full_option_name, type=parse_type(typ), required=field.default is attr.NOTHING, default=field.default if field.default not in (attr.NOTHING, None) else None, show_default=True, multiple=allows_multiple(typ) if typ else False, 
help=field.metadata.get("description", "(No description provided)"), show_envvar=True, envvar=envvar,) + return cog.optgroup.option( + identifier, + full_option_name, + type=parse_type(typ), + required=field.default is attr.NOTHING, + default=field.default if field.default not in (attr.NOTHING, None) else None, + show_default=True, + multiple=allows_multiple(typ) if typ else False, + help=field.metadata.get("description", "(No description provided)"), + show_envvar=True, + envvar=envvar, + ) def env_converter(value: t.Any, env: str | None = None) -> t.Any: if env is not None: value = os.environ.get(env, value) @@ -39,7 +66,18 @@ def env_converter(value: t.Any, env: str | None = None) -> t.Any: except orjson.JSONDecodeError as err: raise RuntimeError(f"Failed to parse ({value!r}) from '{env}': {err}") from None return value -def Field(default: t.Any = None, *, ge: int | float | None = None, le: int | float | None = None, validator: _ValidatorType[t.Any] | None = None, description: str | None = None, env: str | None = None, auto_default: bool = False, use_default_converter: bool = True, **attrs: t.Any) -> t.Any: +def Field( + default: t.Any = None, + *, + ge: int | float | None = None, + le: int | float | None = None, + validator: _ValidatorType[t.Any] | None = None, + description: str | None = None, + env: str | None = None, + auto_default: bool = False, + use_default_converter: bool = True, + **attrs: t.Any +) -> t.Any: """A decorator that extends attr.field with additional arguments, which provides the same interface as pydantic's Field. By default, if both validator and ge are provided, then then ge will be diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py index 35aa07ee..31cafbb2 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -104,7 +104,19 @@ def is_tf_available() -> bool: _tf_version = None if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES: if _tf_available: - candidates = ("tensorflow", "tensorflow-cpu", "tensorflow-gpu", "tf-nightly", "tf-nightly-cpu", "tf-nightly-gpu", "intel-tensorflow", "intel-tensorflow-avx512", "tensorflow-rocm", "tensorflow-macos", "tensorflow-aarch64",) + candidates = ( + "tensorflow", + "tensorflow-cpu", + "tensorflow-gpu", + "tf-nightly", + "tf-nightly-cpu", + "tf-nightly-gpu", + "intel-tensorflow", + "intel-tensorflow-avx512", + "tensorflow-rocm", + "tensorflow-macos", + "tensorflow-aarch64", + ) _tf_version = None # For the metadata, we have to look for both tensorflow and tensorflow-cpu for _pkg in candidates: @@ -240,9 +252,15 @@ You can install it with pip: `pip install fairscale`. Please note that you may n your runtime after installation. 
""" -BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([("flax", (is_flax_available, FLAX_IMPORT_ERROR)), ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), ("vllm", (is_vllm_available, VLLM_IMPORT_ERROR)), ("cpm_kernels", (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)), ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)), ("triton", (is_triton_available, TRITON_IMPORT_ERROR)), ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ( - "peft", (is_peft_available, PEFT_IMPORT_ERROR) -), ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), ("auto-gptq", (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)), ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), ("xformers", (is_xformers_available, XFORMERS_IMPORT_ERROR)), ("fairscale", (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))]) +BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([("flax", (is_flax_available, FLAX_IMPORT_ERROR)), ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), ( + "torch", (is_torch_available, PYTORCH_IMPORT_ERROR) +), ("vllm", (is_vllm_available, VLLM_IMPORT_ERROR)), ("cpm_kernels", (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)), ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)), ( + "triton", (is_triton_available, TRITON_IMPORT_ERROR) +), ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ("peft", (is_peft_available, PEFT_IMPORT_ERROR)), ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), ( + "auto-gptq", (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR) +), ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), ("xformers", (is_xformers_available, XFORMERS_IMPORT_ERROR)), ( + "fairscale", (is_fairscale_available, FAIRSCALE_IMPORT_ERROR) +)]) class DummyMetaclass(abc.ABCMeta): """Metaclass for dummy object. 
@@ -325,7 +343,15 @@ class EnvVarMixin(ReprMixin): elif hasattr(self, item): return getattr(self, item) raise KeyError(f"Key {item} not found in {self}") - def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: LiteralString | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers") -> None: + def __init__( + self, + model_name: str, + implementation: LiteralRuntime = "pt", + model_id: str | None = None, + bettertransformer: bool | None = None, + quantize: LiteralString | None = None, + runtime: t.Literal["ggml", "transformers"] = "transformers" + ) -> None: """EnvVarMixin is a mixin class that returns the value extracted from environment variables.""" from openllm_core.utils import field_env_key self.model_name = inflection.underscore(model_name) diff --git a/openllm-core/src/openllm_core/utils/lazy.py b/openllm-core/src/openllm_core/utils/lazy.py index daf4e560..5205ac69 100644 --- a/openllm-core/src/openllm_core/utils/lazy.py +++ b/openllm-core/src/openllm_core/utils/lazy.py @@ -42,7 +42,15 @@ class VersionInfo: _sentinel, _reserved_namespace = object(), {"__openllm_migration__"} class LazyModule(types.ModuleType): # Very heavily inspired by optuna.integration._IntegrationModule: https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py - def __init__(self, name: str, module_file: str, import_structure: dict[str, list[str]], module_spec: importlib.machinery.ModuleSpec | None = None, doc: str | None = None, extra_objects: dict[str, t.Any] | None = None): + def __init__( + self, + name: str, + module_file: str, + import_structure: dict[str, list[str]], + module_spec: importlib.machinery.ModuleSpec | None = None, + doc: str | None = None, + extra_objects: dict[str, t.Any] | None = None + ): """Lazily load this module as an object. It does instantiate a __all__ and __dir__ for IDE support @@ -86,9 +94,24 @@ class LazyModule(types.ModuleType): It also contains a special case for all of the metadata information, such as __version__ and __version_info__. """ if name in _reserved_namespace: raise openllm_core.exceptions.ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.") - dunder_to_metadata = {"__title__": "Name", "__copyright__": "", "__version__": "version", "__version_info__": "version", "__description__": "summary", "__uri__": "", "__url__": "", "__author__": "", "__email__": "", "__license__": "license", "__homepage__": ""} + dunder_to_metadata = { + "__title__": "Name", + "__copyright__": "", + "__version__": "version", + "__version_info__": "version", + "__description__": "summary", + "__uri__": "", + "__url__": "", + "__author__": "", + "__email__": "", + "__license__": "license", + "__homepage__": "" + } if name in dunder_to_metadata: - if name not in {"__version_info__", "__copyright__", "__version__"}: warnings.warn(f"Accessing '{self._name}.{name}' is deprecated. Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.", DeprecationWarning, stacklevel=2) + if name not in {"__version_info__", "__copyright__", "__version__"}: + warnings.warn( + f"Accessing '{self._name}.{name}' is deprecated. 
Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.", DeprecationWarning, stacklevel=2 + ) meta = importlib.metadata.metadata("openllm") project_url = dict(url.split(", ") for url in t.cast(t.List[str], meta.get_all("Project-URL"))) if name == "__license__": return "Apache-2.0" diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index 030dafd6..3101350c 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -102,7 +102,9 @@ else: try: if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: - _import_structure["utils.dummy_pt_objects"] = [name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")] + _import_structure["utils.dummy_pt_objects"] = [ + name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations") + ] else: _import_structure["models.flan_t5"].extend(["FlanT5"]) _import_structure["models.dolly_v2"].extend(["DollyV2"]) diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py index af91e6d3..6c582cca 100644 --- a/openllm-python/src/openllm/_embeddings.py +++ b/openllm-python/src/openllm/_embeddings.py @@ -12,9 +12,24 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model: try: return bentoml.transformers.get(ids) except bentoml.exceptions.NotFound: - model_signatures = {k: ModelSignature(batchable=False) for k in ("forward", "generate", "contrastive_search", "greedy_search", "sample", "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__")} - with bentoml.models.create(ids, module=MODULE_NAME, api_version=API_VERSION, options=ModelOptions(), context=openllm.utils.generate_context(framework_name="transformers"), labels={"runtime": "pt", "framework": "openllm"}, signatures=model_signatures) as bentomodel: - snapshot_download(_GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors", "*.h5", "*.ot", "*.pdf", "*.md", ".gitattributes", "LICENSE.txt"]) + model_signatures = { + k: ModelSignature(batchable=False) + for k in ("forward", "generate", "contrastive_search", "greedy_search", "sample", "beam_search", "beam_sample", "group_beam_search", "constrained_beam_search", "__call__") + } + with bentoml.models.create( + ids, + module=MODULE_NAME, + api_version=API_VERSION, + options=ModelOptions(), + context=openllm.utils.generate_context(framework_name="transformers"), + labels={ + "runtime": "pt", "framework": "openllm" + }, + signatures=model_signatures + ) as bentomodel: + snapshot_download( + _GENERIC_EMBEDDING_ID, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=["*.safetensors", "*.h5", "*.ot", "*.pdf", "*.md", ".gitattributes", "LICENSE.txt"] + ) return bentomodel class GenericEmbeddingRunnable(bentoml.Runnable): SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu") diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 03fc45fa..79b7a055 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -1,16 +1,12 @@ from __future__ import annotations -import functools, inspect, logging, os, re, traceback, types, typing as t, uuid, attr, fs.path, inflection, orjson, bentoml, openllm, openllm_core, gc 
-from abc import ABC, abstractmethod -from pathlib import Path +import functools, inspect, logging, os, re, traceback, types, typing as t, uuid, attr, fs.path, inflection, orjson, bentoml, openllm, openllm_core, gc, pathlib, abc from huggingface_hub import hf_hub_download from bentoml._internal.models.model import ModelSignature - from openllm_core._configuration import FineTuneConfig, LLMConfig, _object_getattribute, _setattr_class -from ._quantisation import infer_quantisation_config from openllm_core._schema import unmarshal_vllm_outputs -from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException -from .models.auto import AutoConfig from openllm_core.utils import DEBUG, ENV_VARS_TRUE_VALUES, MYPY, EnvVarMixin, LazyLoader, ReprMixin, apply, bentoml_cattr, codegen, device_count, first_not_none, generate_hash_from_file, is_peft_available, is_torch_available, non_intrusive_setattr, normalize_attrs_to_model_tokenizer_pair, resolve_filepath, validate_is_path +from ._quantisation import infer_quantisation_config +from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException from .utils import infer_auto_class from openllm_core._typing_compat import AdaptersMapping, AdaptersTuple, AnyCallable, AdapterType, LiteralRuntime, DictStrAny, ListStr, LLMEmbeddings, LLMRunnable, LLMRunner, ModelSignatureDict as _ModelSignatureDict, PeftAdapterOutput, TupleAny, NotRequired, overload, M, T, LiteralString @@ -68,7 +64,7 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp resolved[_peft_type] += (_AdaptersTuple((path_or_adapter_id, resolve_name, resolved_config)),) return resolved _reserved_namespace = {"config_class", "model", "tokenizer", "import_kwargs"} -class LLMInterface(ABC, t.Generic[M, T]): +class LLMInterface(abc.ABC, t.Generic[M, T]): """This defines the loose contract for all openllm.LLM implementations.""" @property def import_kwargs(self) -> tuple[DictStrAny, DictStrAny] | None: @@ -91,7 +87,7 @@ class LLMInterface(ABC, t.Generic[M, T]): """ raise NotImplementedError - @abstractmethod + @abc.abstractmethod def generate(self, prompt: str, **preprocess_generate_kwds: t.Any) -> t.Any: """The implementation for text generation from given prompt. @@ -141,7 +137,7 @@ class LLMInterface(ABC, t.Generic[M, T]): """ raise NotImplementedError - def save_pretrained(self, save_directory: str | Path, **attrs: t.Any) -> None: + def save_pretrained(self, save_directory: str | pathlib.Path, **attrs: t.Any) -> None: """This function defines how this model can be saved to local store. This will be called during ``import_model``. By default, it will use ``openllm.serialisation.save_pretrained``. @@ -234,7 +230,7 @@ class _llm_post_init_wrapper(t.Generic[M, T], t.Protocol): def __call__(self, llm: LLM[M, T]) -> T: ... class _save_pretrained_wrapper(t.Generic[M, T], t.Protocol): - def __call__(self, llm: LLM[M, T], save_directory: str | Path, **attrs: t.Any) -> None: + def __call__(self, llm: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None: ... _object_setattr = object.__setattr__ # NOTE: the following wrappers are light meta ops that wrap default params around the internal method implementations.
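Context for the _wrapped_* helpers in the hunks that follow: as the NOTE above says, they are thin functools.wraps shims that inject default parameters before delegating to the user-defined method. Below is a generic sketch of that pattern under assumed names (wrap_with_defaults and ExampleLLM are illustrative, not the actual openllm internals); preserving the wrapped function's metadata via functools.wraps is what keeps introspection and generated docs intact.

import functools, typing as t

def wrap_with_defaults(f: t.Callable[..., t.Any], defaults: t.Dict[str, t.Any]) -> t.Callable[..., t.Any]:
  @functools.wraps(f)  # keep the wrapped method's __name__ and __doc__ for later introspection
  def wrapper(self: t.Any, *args: t.Any, **attrs: t.Any) -> t.Any:
    for key, value in defaults.items():
      attrs.setdefault(key, value)  # caller-supplied kwargs always win over injected defaults
    return f(self, *args, **attrs)
  return wrapper

class ExampleLLM:
  def load_model(self, **attrs: t.Any) -> t.Dict[str, t.Any]:
    return attrs

ExampleLLM.load_model = wrap_with_defaults(ExampleLLM.load_model, {"torch_dtype": "auto"})
print(ExampleLLM().load_model(device_map="auto"))  # {'device_map': 'auto', 'torch_dtype': 'auto'}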
@@ -250,7 +246,9 @@ def _wrapped_import_model(f: _import_model_wrapper[bentoml.Model, M, T]) -> t.Ca return wrapper _DEFAULT_TOKENIZER = "hf-internal-testing/llama-tokenizer" def get_engine_args(llm: LLM[M, T], tokenizer: str = _DEFAULT_TOKENIZER) -> vllm.EngineArgs: - return vllm.EngineArgs(model=llm._bentomodel.path, tokenizer=tokenizer, tokenizer_mode="auto", tensor_parallel_size=1 if device_count() < 2 else device_count(), dtype="auto", worker_use_ray=False) + return vllm.EngineArgs( + model=llm._bentomodel.path, tokenizer=tokenizer, tokenizer_mode="auto", tensor_parallel_size=1 if device_count() < 2 else device_count(), dtype="auto", worker_use_ray=False + ) def _wrapped_load_model(f: _load_model_wrapper[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]: @functools.wraps(f) def wrapper(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine: @@ -279,12 +277,13 @@ def _wrapped_llm_post_init(f: _llm_post_init_wrapper[M, T]) -> t.Callable[[LLM[M f(self) return wrapper -def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[LLM[M, T], str | Path], None]: +def _wrapped_save_pretrained(f: _save_pretrained_wrapper[M, T]) -> t.Callable[[LLM[M, T], str | pathlib.Path], None]: @functools.wraps(f) - def wrapper(self: LLM[M, T], save_directory: str | Path, **attrs: t.Any) -> None: - if isinstance(save_directory, Path): save_directory = str(save_directory) + def wrapper(self: LLM[M, T], save_directory: str | pathlib.Path, **attrs: t.Any) -> None: + if isinstance(save_directory, pathlib.Path): save_directory = str(save_directory) if self.__llm_model__ is None: raise RuntimeError("Cannot 'save_pretrained' with unload model instance.") - if self.bettertransformer and self.__llm_implementation__ == "pt": _object_setattr(self, "__llm_model__", t.cast("transformers.PreTrainedModel", self.__llm_model__).reverse_bettertransformer()) + if self.bettertransformer and self.__llm_implementation__ == "pt": + _object_setattr(self, "__llm_model__", t.cast("transformers.PreTrainedModel", self.__llm_model__).reverse_bettertransformer()) f(self, save_directory, **attrs) return wrapper @@ -300,7 +299,13 @@ def _update_docstring(cls: LLM[M, T], fn: str) -> AnyCallable: setattr(cls, fn, original_fn) return original_fn def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], None]: - attributes = {"import_model": _wrapped_import_model, "load_model": _wrapped_load_model, "load_tokenizer": _wrapped_load_tokenizer, "llm_post_init": _wrapped_llm_post_init, "save_pretrained": _wrapped_save_pretrained} + attributes = { + "import_model": _wrapped_import_model, + "load_model": _wrapped_load_model, + "load_tokenizer": _wrapped_load_tokenizer, + "llm_post_init": _wrapped_llm_post_init, + "save_pretrained": _wrapped_save_pretrained + } args: ListStr = [] anns: DictStrAny = {} lines: ListStr = [] @@ -372,7 +377,7 @@ class LLM(LLMInterface[M, T], ReprMixin): cd = cls.__dict__ implementation, config_class_name = cls._infer_implementation_from_name(cls.__name__) cls.__llm_implementation__ = implementation - config_class = AutoConfig.infer_class_from_name(config_class_name) + config_class = openllm.AutoConfig.infer_class_from_name(config_class_name) if "__openllm_internal__" in cd: if "config_class" not in cd: cls.config_class = config_class elif "config_class" not in cd: raise RuntimeError("Missing required key 'config_class'. 
Make sure to define it within the LLM subclass.") @@ -532,11 +537,14 @@ class LLM(LLMInterface[M, T], ReprMixin): return f"{cls.__llm_implementation__}-{model_name}:{maybe_revision[0]}" tag_name = f"{cls.__llm_implementation__}-{model_name}" - if os.environ.get("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES: return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag) + if os.environ.get("OPENLLM_USE_LOCAL_LATEST", str(False)).upper() in ENV_VARS_TRUE_VALUES: + return bentoml_cattr.unstructure(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag) if validate_is_path(model_id): model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id)) else: from .serialisation.transformers._helpers import process_config - model_version = getattr(process_config(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default="main"))[0], "_commit_hash", None) + model_version = getattr( + process_config(model_id, trust_remote_code=cls.config_class.__openllm_trust_remote_code__, revision=first_not_none(model_version, default="main"))[0], "_commit_hash", None + ) if model_version is None: raise ValueError(f"Internal errors when parsing config for pretrained '{model_id}' ('commit_hash' not found)") return f"{tag_name}:{model_version}" @@ -544,7 +552,22 @@ class LLM(LLMInterface[M, T], ReprMixin): def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag: return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs)) - def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig, bettertransformer: bool | None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, _quantize_method: t.Literal["int8", "int4", "gptq"] | None, _runtime: t.Literal["ggml", "transformers"], _model_version: str, _serialisation_format: t.Literal["safetensors", "legacy"], _local: bool, **attrs: t.Any,): + def __init__( + self, + *args: t.Any, + model_id: str, + llm_config: LLMConfig, + bettertransformer: bool | None, + quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, + _adapters_mapping: AdaptersMapping | None, + _tag: bentoml.Tag, + _quantize_method: t.Literal["int8", "int4", "gptq"] | None, + _runtime: t.Literal["ggml", "transformers"], + _model_version: str, + _serialisation_format: t.Literal["safetensors", "legacy"], + _local: bool, + **attrs: t.Any, + ): """Initialize the LLM with given pretrained model. 
> [!WARNING] @@ -641,10 +664,28 @@ class LLM(LLMInterface[M, T], ReprMixin): # parsing tokenizer and model kwargs, as the hierarchy is param pass > default normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs) # NOTE: Save the args and kwargs for later load - self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, {**model_kwds, **normalized_model_kwds}, {**tokenizer_kwds, **normalized_tokenizer_kwds}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local) + self.__attrs_init__( + llm_config, + quantization_config, + model_id, + _runtime, + args, { + **model_kwds, **normalized_model_kwds + }, { + **tokenizer_kwds, **normalized_tokenizer_kwds + }, + _tag, + _adapters_mapping, + _model_version, + _quantize_method, + _serialisation_format, + _local + ) # handle trust_remote_code _from_env = os.getenv("TRUST_REMOTE_CODE", None) - self.__llm_trust_remote_code__ = first_not_none(str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"])) + self.__llm_trust_remote_code__ = first_not_none( + str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"]) + ) self.llm_post_init() # we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init @@ -654,7 +695,10 @@ class LLM(LLMInterface[M, T], ReprMixin): if _adapters_mapping and self.bettertransformer is True: self.bettertransformer = False def __setattr__(self, attr: str, value: t.Any) -> None: - if attr in _reserved_namespace: raise ForbiddenAttributeError(f"{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.") + if attr in _reserved_namespace: + raise ForbiddenAttributeError( + f"{attr} should not be set during runtime as these values will be reflected at runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}." + ) super().__setattr__(attr, value) @property @@ -704,7 +748,15 @@ class LLM(LLMInterface[M, T], ReprMixin): return self._tag def ensure_model_id_exists(self) -> bentoml.Model: - return openllm.import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format) + return openllm.import_model( + self.config["start_name"], + model_id=self.model_id, + model_version=self._model_version, + runtime=self.runtime, + implementation=self.__llm_implementation__, + quantize=self._quantize_method, + serialisation_format=self._serialisation_format + ) @property def _bentomodel(self) -> bentoml.Model: @@ -747,7 +799,9 @@ class LLM(LLMInterface[M, T], ReprMixin): try: model = model.to("cuda") except Exception as err: - raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err + raise OpenLLMException( + f"Failed to load {self} into GPU: {err}\nTip: If you run into an OOM issue, maybe try a different offload strategy.
See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information." + ) from err self.__llm_model__ = model return self.__llm_model__ @@ -758,7 +812,9 @@ class LLM(LLMInterface[M, T], ReprMixin): return self.__llm_tokenizer__ def _default_ft_config(self, _adapter_type: AdapterType, inference_mode: bool) -> FineTuneConfig: - strategy = first_not_none(self.config["fine_tune_strategies"].get(_adapter_type), default=FineTuneConfig(adapter_type=t.cast("PeftType", _adapter_type), llm_config_class=self.config_class)) + strategy = first_not_none( + self.config["fine_tune_strategies"].get(_adapter_type), default=FineTuneConfig(adapter_type=t.cast("PeftType", _adapter_type), llm_config_class=self.config_class) + ) return strategy.eval() if inference_mode else strategy.train() def _transpose_adapter_mapping(self, inference_mode: bool = True, use_cache: bool = True) -> ResolvedAdaptersMapping: @@ -773,19 +829,24 @@ class LLM(LLMInterface[M, T], ReprMixin): for _adapter_type, _adapters_tuples in self._adapters_mapping.items(): default_config = self._default_ft_config(_adapter_type, inference_mode) for adapter in _adapters_tuples: - if not adapter.name and _converted_first_none: raise ValueError(f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}") + if not adapter.name and _converted_first_none: + raise ValueError(f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}") name = adapter.name if name is None: _converted_first_none = True name = "default" - peft_config = default_config.with_config(**adapter.config).to_peft_config() if name == "default" else FineTuneConfig(adapter_type=t.cast("PeftType", _adapter_type), adapter_config=adapter.config, inference_mode=inference_mode, llm_config_class=self.config_class).to_peft_config() + peft_config = default_config.with_config(**adapter.config).to_peft_config() if name == "default" else FineTuneConfig( + adapter_type=t.cast("PeftType", _adapter_type), adapter_config=adapter.config, inference_mode=inference_mode, llm_config_class=self.config_class + ).to_peft_config() adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id) if self.__llm_adapter_map__ is None and use_cache: self.__llm_adapter_map__ = adapter_map return adapter_map def prepare_for_training(self, adapter_type: AdapterType = "lora", use_gradient_checkpointing: bool = True, **attrs: t.Any) -> tuple[peft.PeftModel, T]: from peft import prepare_model_for_kbit_training - peft_config = self.config["fine_tune_strategies"].get(adapter_type, FineTuneConfig(adapter_type=t.cast("PeftType", adapter_type), llm_config_class=self.config_class)).train().with_config(**attrs).to_peft_config() + peft_config = self.config["fine_tune_strategies"].get(adapter_type, FineTuneConfig(adapter_type=t.cast("PeftType", adapter_type), llm_config_class=self.config_class)).train().with_config( + **attrs + ).to_peft_config() wrapped_peft = peft.get_peft_model(prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checkpointing), peft_config) if DEBUG: wrapped_peft.print_trainable_parameters() return wrapped_peft, self.tokenizer @@ -846,7 +907,13 @@ class LLM(LLMInterface[M, T], ReprMixin): # order of these fields matter here, make sure to sync it with # openllm.models.auto.factory.BaseAutoLLMClass.for_model - def to_runner(self, models: list[bentoml.Model] | None = None, max_batch_size: int | 
None = None, max_latency_ms: int | None = None, scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]: + def to_runner( + self, + models: list[bentoml.Model] | None = None, + max_batch_size: int | None = None, + max_latency_ms: int | None = None, + scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy + ) -> LLMRunner[M, T]: """Convert this LLM into a Runner. Args: @@ -879,7 +946,18 @@ class LLM(LLMInterface[M, T], ReprMixin): generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False))) # NOTE: returning the two langchain API's to the runner - return llm_runner_class(self)(llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig), name=self.runner_name, embedded=False, models=models, max_batch_size=max_batch_size, max_latency_ms=max_latency_ms, method_configs=bentoml_cattr.unstructure({"embeddings": embeddings_sig, "__call__": generate_sig, "generate": generate_sig, "generate_one": generate_sig, "generate_iterator": generate_iterator_sig}), scheduling_strategy=scheduling_strategy,) + return llm_runner_class(self)( + llm_runnable_class(self, embeddings_sig, generate_sig, generate_iterator_sig), + name=self.runner_name, + embedded=False, + models=models, + max_batch_size=max_batch_size, + max_latency_ms=max_latency_ms, + method_configs=bentoml_cattr.unstructure({ + "embeddings": embeddings_sig, "__call__": generate_sig, "generate": generate_sig, "generate_one": generate_sig, "generate_iterator": generate_iterator_sig + }), + scheduling_strategy=scheduling_strategy, + ) # NOTE: Scikit API def predict(self, prompt: str, **attrs: t.Any) -> t.Any: @@ -908,7 +986,18 @@ class LLM(LLMInterface[M, T], ReprMixin): pass return [it] - def generate_iterator(self, prompt: str, /, *, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]: + def generate_iterator( + self, + prompt: str, + /, + *, + context_length: int | None = None, + echo: bool = True, + stream_interval: int = 2, + stop: str | t.Iterable[str] | None = None, + stop_token_ids: list[int] | None = None, + **attrs: t.Any + ) -> t.Iterator[t.Any]: # NOTE: encoder-decoder models will need to implement their own generate_iterator for now # inspired from fastchat's generate_stream_func from ._generation import prepare_logits_processor, get_context_length, is_partial_stop @@ -937,7 +1026,8 @@ class LLM(LLMInterface[M, T], ReprMixin): logits = out.logits past_key_values = out.past_key_values - last_token_logits = logits_processor(torch.as_tensor([output_ids], device=logits.device) if self.config["repetition_penalty"] > 1.0 else None, logits[:, -1, :])[0] if logits_processor else logits[0, -1, :] + last_token_logits = logits_processor(torch.as_tensor([output_ids], device=logits.device) + if self.config["repetition_penalty"] > 1.0 else None, logits[:, -1, :])[0] if logits_processor else logits[0, -1, :] # Switch to CPU by avoiding some bugs in mps backend. 
if self.device.type == "mps": last_token_logits = last_token_logits.float().to("cpu") diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index dc7fb816..0de8acae 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -24,14 +24,27 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo int8_skip_modules: list[str] | None = attrs.pop("llm_int8_skip_modules", None) int8_has_fp16_weight = attrs.pop("llm_int8_has_fp16_weight", False) - autogptq_attrs: DictStrAny = {"bits": attrs.pop("gptq_bits", 4), "group_size": attrs.pop("gptq_group_size", -1), "damp_percent": attrs.pop("gptq_damp_percent", 0.01), "desc_act": attrs.pop("gptq_desc_act", True), "sym": attrs.pop("gptq_sym", True), "true_sequential": attrs.pop("gptq_true_sequential", True),} + autogptq_attrs: DictStrAny = { + "bits": attrs.pop("gptq_bits", 4), + "group_size": attrs.pop("gptq_group_size", -1), + "damp_percent": attrs.pop("gptq_damp_percent", 0.01), + "desc_act": attrs.pop("gptq_desc_act", True), + "sym": attrs.pop("gptq_sym", True), + "true_sequential": attrs.pop("gptq_true_sequential", True), + } def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig: if int8_skip_modules is None: int8_skip_modules = [] if "lm_head" not in int8_skip_modules and cls.config_class.__openllm_model_type__ == "causal_lm": logger.debug("Skipping 'lm_head' for quantization for %s", cls.__name__) int8_skip_modules.append("lm_head") - return transformers.BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload, llm_int8_threshhold=int8_threshold, llm_int8_skip_modules=int8_skip_modules, llm_int8_has_fp16_weight=int8_has_fp16_weight,) + return transformers.BitsAndBytesConfig( + load_in_8bit=True, + llm_int8_enable_fp32_cpu_offload=int8_enable_fp32_cpu_offload, + llm_int8_threshhold=int8_threshold, + llm_int8_skip_modules=int8_skip_modules, + llm_int8_has_fp16_weight=int8_has_fp16_weight, + ) # 4 bit configuration int4_compute_dtype = attrs.pop("bnb_4bit_compute_dtype", torch.bfloat16) @@ -44,13 +57,21 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo if not is_bitsandbytes_available(): raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'") if quantise == "int8": quantisation_config = create_int8_config(int8_skip_modules) elif quantise == "int4": - if is_transformers_supports_kbit(): quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=int4_compute_dtype, bnb_4bit_quant_type=int4_quant_type, bnb_4bit_use_double_quant=int4_use_double_quant) + if is_transformers_supports_kbit(): + quantisation_config = transformers.BitsAndBytesConfig( + load_in_4bit=True, bnb_4bit_compute_dtype=int4_compute_dtype, bnb_4bit_quant_type=int4_quant_type, bnb_4bit_use_double_quant=int4_use_double_quant + ) else: - logger.warning("'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. 
Fallback to int8 quantisation.", pkg.pkg_version_info("transformers")) + logger.warning( + "'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.", + pkg.pkg_version_info("transformers") + ) quantisation_config = create_int8_config(int8_skip_modules) elif quantise == "gptq": if not is_autogptq_available(): - logger.warning("'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes.") + logger.warning( + "'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes." + ) quantisation_config = create_int8_config(int8_skip_modules) else: quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs) diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 4df93135..c347fd11 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -1,13 +1,15 @@ -# mypy: disable-error-code="call-arg,misc,attr-defined,type-abstract" +# mypy: disable-error-code="call-arg,misc,attr-defined,type-abstract,type-arg,valid-type,arg-type" from __future__ import annotations import os, warnings, orjson, bentoml, openllm, openllm_core, typing as t from starlette.applications import Starlette from starlette.responses import JSONResponse from starlette.routing import Route if t.TYPE_CHECKING: + from openllm_core._typing_compat import TypeAlias from starlette.requests import Request from starlette.responses import Response from bentoml._internal.runner.runner import RunnerMethod, AbstractRunner + _EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]], t.Sequence[openllm.LLMEmbeddings]] # The following warnings from bitsandbytes, and probably not that important for users to see warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization") warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization") @@ -16,7 +18,13 @@ model = os.environ.get("OPENLLM_MODEL", "{__model_name__}") # openllm: model na adapter_map = os.environ.get("OPENLLM_ADAPTER_MAP", """{__model_adapter_map__}""") # openllm: model adapter map llm_config = openllm.AutoConfig.for_model(model) runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map)) -generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name="llm-generic-embedding", scheduling_strategy=openllm_core.CascadingResourceStrategy, max_batch_size=32, max_latency_ms=300) # type: ignore[arg-type] # XXX: remove once bentoml.Runner is correct set with type. 
+generic_embedding_runner = bentoml.Runner( + openllm.GenericEmbeddingRunnable, # XXX: remove arg-type once bentoml.Runner is correct set with type + name="llm-generic-embedding", + scheduling_strategy=openllm_core.CascadingResourceStrategy, + max_batch_size=32, + max_latency_ms=300 +) runners: list[AbstractRunner] = [runner] if not runner.supports_embeddings: runners.append(generic_embedding_runner) svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners) @@ -31,9 +39,29 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput: async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]: qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, **qa_inputs.llm_config.model_dump()) -@svc.api(route="/v1/metadata", input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample({"model_id": runner.llm.model_id, "timeout": 3600, "model_name": llm_config["model_name"], "framework": "pt", "configuration": "", "supports_embeddings": runner.supports_embeddings, "supports_hf_agent": runner.supports_hf_agent})) +@svc.api( + route="/v1/metadata", + input=bentoml.io.Text(), + output=bentoml.io.JSON.from_sample({ + "model_id": runner.llm.model_id, + "timeout": 3600, + "model_name": llm_config["model_name"], + "framework": "pt", + "configuration": "", + "supports_embeddings": runner.supports_embeddings, + "supports_hf_agent": runner.supports_hf_agent + }) +) def metadata_v1(_: str) -> openllm.MetadataOutput: - return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent) + return openllm.MetadataOutput( + timeout=llm_config["timeout"], + model_name=llm_config["model_name"], + framework=llm_config["env"]["framework_value"], + model_id=runner.llm.model_id, + configuration=llm_config.model_dump_json().decode(), + supports_embeddings=runner.supports_embeddings, + supports_hf_agent=runner.supports_hf_agent + ) @svc.api( route="/v1/embeddings", input=bentoml.io.JSON.from_sample(["Hey Jude, welcome to the jungle!", "What is the meaning of life?"]), @@ -70,7 +98,7 @@ def metadata_v1(_: str) -> openllm.MetadataOutput: }) ) async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput: - embed_call: RunnerMethod[bentoml.Runnable | openllm.LLMRunnable[t.Any, t.Any], [list[str]], t.Sequence[openllm.LLMEmbeddings]] = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type] + embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type] responses = (await embed_call.async_run(phrases))[0] return openllm.EmbeddingsOutput(embeddings=responses["embeddings"], num_tokens=responses["num_tokens"]) if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent(): diff --git a/openllm-python/src/openllm/bundle/__init__.py b/openllm-python/src/openllm/bundle/__init__.py index 16517ef2..520c92ec 100644 --- a/openllm-python/src/openllm/bundle/__init__.py +++ b/openllm-python/src/openllm/bundle/__init__.py @@ -5,7 +5,10 @@ These utilities will stay internal, and its API can be 
changed or updated withou from __future__ import annotations import os, typing as t from openllm_core.utils import LazyModule -_import_structure: dict[str, list[str]] = {"_package": ["create_bento", "build_editable", "construct_python_options", "construct_docker_options"], "oci": ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"]} +_import_structure: dict[str, list[str]] = { + "_package": ["create_bento", "build_editable", "construct_python_options", "construct_docker_options"], + "oci": ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"] +} if t.TYPE_CHECKING: from . import _package as _package, oci as oci diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index db98f1aa..02b6a53a 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -54,7 +54,18 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d packages.extend([importlib.metadata.version("flax"), importlib.metadata.version("jax"), importlib.metadata.version("jaxlib")]) elif framework_envvar == "tf": if not openllm_core.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'") - candidates = ("tensorflow", "tensorflow-cpu", "tensorflow-gpu", "tf-nightly", "tf-nightly-cpu", "tf-nightly-gpu", "intel-tensorflow", "intel-tensorflow-avx512", "tensorflow-rocm", "tensorflow-macos",) + candidates = ( + "tensorflow", + "tensorflow-cpu", + "tensorflow-gpu", + "tf-nightly", + "tf-nightly-cpu", + "tf-nightly-gpu", + "intel-tensorflow", + "intel-tensorflow-avx512", + "tensorflow-rocm", + "tensorflow-macos", + ) # For the metadata, we have to look for both tensorflow and tensorflow-cpu for candidate in candidates: try: @@ -70,15 +81,39 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d if not openllm_core.utils.is_torch_available(): raise ValueError("PyTorch is not available. 
Make sure to have it locally installed.") packages.extend([f'torch>={importlib.metadata.version("torch")}']) wheels: list[str] = [] - built_wheels: list[str | None] = [build_editable(llm_fs.getsyspath("/"), t.cast(t.Literal["openllm", "openllm_core", "openllm_client"], p)) for p in ("openllm_core", "openllm_client", "openllm")] + built_wheels: list[str | None] = [ + build_editable(llm_fs.getsyspath("/"), t.cast(t.Literal["openllm", "openllm_core", "openllm_client"], p)) for p in ("openllm_core", "openllm_client", "openllm") + ] if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)]) return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"]) -def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: +def construct_docker_options( + llm: openllm.LLM[t.Any, t.Any], + _: FS, + workers_per_resource: float, + quantize: LiteralString | None, + bettertransformer: bool | None, + adapter_map: dict[str, str | None] | None, + dockerfile_template: str | None, + runtime: t.Literal["ggml", "transformers"], + serialisation_format: t.Literal["safetensors", "legacy"], + container_registry: LiteralContainerRegistry, + container_version_strategy: LiteralContainerVersionStrategy +) -> DockerOptions: from openllm.cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy()) env: openllm_core.utils.EnvVarMixin = llm.config["env"] if env["framework_value"] == "vllm": serialisation_format = "legacy" - env_dict = {env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'", env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}", "OPENLLM_MODEL": llm.config["model_name"], "OPENLLM_SERIALIZATION": serialisation_format, "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", "BENTOML_DEBUG": str(True), "BENTOML_QUIET": str(False), "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'",} + env_dict = { + env.framework: env["framework_value"], + env.config: f"'{llm.config.model_dump_json().decode()}'", + env.model_id: f"/home/bentoml/bento/models/{llm.tag.path()}", + "OPENLLM_MODEL": llm.config["model_name"], + "OPENLLM_SERIALIZATION": serialisation_format, + "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'", + "BENTOML_DEBUG": str(True), + "BENTOML_QUIET": str(False), + "BENTOML_CONFIG_OPTIONS": f"'{environ['BENTOML_CONFIG_OPTIONS']}'", + } if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1") # We need to handle None separately here, as env from subprocess doesn't accept None value. 
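
A note on the env_dict convention in construct_docker_options above: JSON payloads (the dumped model config and the adapter map) are wrapped in single quotes so each survives as a single ENV value in the generated Dockerfile. A minimal sketch of that quoting pattern, using hypothetical stand-in values rather than a real LLM instance (the "OPENLLM_CONFIG" key below stands in for whatever name env.config resolves to):

import orjson

# Hypothetical stand-ins for llm.config.model_dump_json() and adapter_map;
# at build time these come from the LLM being packaged.
config_json = orjson.dumps({"model_name": "opt", "timeout": 3600}).decode()
adapter_map = {"some/adapter-id": "eng_lora"}  # hypothetical adapter mapping

env_dict = {
    # Single quotes keep each JSON blob as one ENV value in the Dockerfile.
    "OPENLLM_CONFIG": f"'{config_json}'",  # stands in for env.config
    "OPENLLM_ADAPTER_MAP": f"'{orjson.dumps(adapter_map).decode()}'",
    "OPENLLM_SERIALIZATION": "safetensors",
    "BENTOML_DEBUG": str(True),
}

for key, value in env_dict.items():
    print(f"ENV {key}={value}")

As the comment above notes, None values still need separate handling, since an environment passed to a subprocess cannot carry None.
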
@@ -120,7 +155,8 @@ def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | N src_contents = f.readlines() for it in src_contents: if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + "\n") - elif OPENLLM_MODEL_ADAPTER_MAP in it: src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n") + elif OPENLLM_MODEL_ADAPTER_MAP in it: + src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n") script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + "".join(src_contents) if DEBUG: logger.info("Generated script:\n%s", script) llm_fs.writetext(llm.config["service_name"], script) @@ -170,7 +206,9 @@ def create_bento( exclude=["/venv", "/.venv", "__pycache__/", "*.py[cod]", "*$py.class"], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec], - docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy) + docker=construct_docker_options( + llm, llm_fs, workers_per_resource, quantize, bettertransformer, adapter_map, dockerfile_template, runtime, serialisation_format, container_registry, container_version_strategy + ) ) bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath("/")) diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index 52bc2bdd..2bb2d274 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -45,7 +45,9 @@ def nightly_resolver(cls: type[RefResolver]) -> str: # If docker is not found, then fallback to previous behaviour. Which the container might not exists. docker_bin = shutil.which("docker") if docker_bin is None: - logger.warning("To get the correct available nightly container, make sure to have docker available. Fallback to previous behaviour for determine nightly hash (container might not exists due to the lack of GPU machine at a time. See https://github.com/bentoml/OpenLLM/pkgs/container/openllm for available image.)") + logger.warning( + "To get the correct available nightly container, make sure to have docker available. Fallback to previous behaviour for determine nightly hash (container might not exists due to the lack of GPU machine at a time. See https://github.com/bentoml/OpenLLM/pkgs/container/openllm for available image.)" + ) commits = t.cast("list[dict[str, t.Any]]", cls._ghapi.repos.list_commits(since=_commit_time_range())) return next(f'sha-{it["sha"][:7]}' for it in commits if "[skip ci]" not in it["commit"]["message"]) # now is the correct behaviour @@ -71,7 +73,8 @@ class RefResolver: version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")["object"]["sha"], version_str) else: version = ("", version_str) - if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. 
Consider using 'nightly' or upgrade 'openllm>=0.2.12'") + if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): + raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'") return _RefTuple((*version, "release" if _use_base_strategy else "custom")) @classmethod @@ -96,7 +99,12 @@ class RefResolver: @functools.lru_cache(maxsize=256) def get_base_container_tag(strategy: LiteralContainerVersionStrategy | None = None) -> str: return RefResolver.from_strategy(strategy).tag -def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None, version_strategy: LiteralContainerVersionStrategy = "release", push: bool = False, machine: bool = False) -> dict[str | LiteralContainerRegistry, str]: +def build_container( + registries: LiteralContainerRegistry | t.Sequence[LiteralContainerRegistry] | None = None, + version_strategy: LiteralContainerVersionStrategy = "release", + push: bool = False, + machine: bool = False +) -> dict[str | LiteralContainerRegistry, str]: try: if not _BUILDER.health(): raise openllm.exceptions.Error except (openllm.exceptions.Error, subprocess.CalledProcessError): @@ -106,12 +114,22 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon if not _module_location: raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)") pyproject_path = pathlib.Path(_module_location).parent.parent / "pyproject.toml" if not pyproject_path.exists(): raise ValueError("This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'") - if not registries: tags: dict[str | LiteralContainerRegistry, str] = {alias: f"{value}:{get_base_container_tag(version_strategy)}" for alias, value in _CONTAINER_REGISTRY.items()} # default to all registries with latest tag strategy + if not registries: + tags: dict[str | LiteralContainerRegistry, str] = { + alias: f"{value}:{get_base_container_tag(version_strategy)}" for alias, value in _CONTAINER_REGISTRY.items() + } # default to all registries with latest tag strategy else: registries = [registries] if isinstance(registries, str) else list(registries) tags = {name: f"{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}" for name in registries} try: - outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), context_path=pyproject_path.parent.__fspath__(), tag=tuple(tags.values()), push=push, progress="plain" if openllm_core.utils.get_debug_mode() else "auto", quiet=machine) + outputs = _BUILDER.build( + file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), + context_path=pyproject_path.parent.__fspath__(), + tag=tuple(tags.values()), + push=push, + progress="plain" if openllm_core.utils.get_debug_mode() else "auto", + quiet=machine + ) if machine and outputs is not None: tags["image_sha"] = outputs.decode("utf-8").strip() except Exception as err: raise openllm.exceptions.OpenLLMException(f"Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}") from err diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index 8c5e9f84..e180f0ce 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ 
-25,7 +25,12 @@ def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny: # TODO: Support amd.com/gpu on k8s _bentoml_config_options_env = environ.pop("BENTOML_CONFIG_OPTIONS", "") - _bentoml_config_options_opts = ["tracing.sample_rate=1.0", f"api_server.traffic.timeout={server_timeout}", f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}', f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}'] + _bentoml_config_options_opts = [ + "tracing.sample_rate=1.0", + f"api_server.traffic.timeout={server_timeout}", + f'runners."llm-{config["start_name"]}-runner".traffic.timeout={config["timeout"]}', + f'runners."llm-{config["start_name"]}-runner".workers_per_resource={workers_per_resource}' + ] if device: if len(device) > 1: _bentoml_config_options_opts.extend([f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)]) else: _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]') @@ -81,17 +86,38 @@ Available official model_id(s): [default: {llm_config['default_id']}] if llm_config["requires_gpu"] and openllm.utils.device_count() < 1: # NOTE: The model requires GPU, therefore we will return a dummy command - command_attrs.update({"short_help": "(Disabled because there is no GPU available)", "help": f"{model} is currently not available to run on your local machine because it requires GPU for inference."}) + command_attrs.update({ + "short_help": "(Disabled because there is no GPU available)", "help": f"{model} is currently not available to run on your local machine because it requires GPU for inference." + }) return noop_command(group, llm_config, _serve_grpc, **command_attrs) @group.command(**command_attrs) @start_decorator(llm_config, serve_grpc=_serve_grpc) @click.pass_context - def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString, device: t.Tuple[str, ...], quantize: t.Literal["int8", "int4", "gptq"] | None, bettertransformer: bool | None, runtime: t.Literal["ggml", "transformers"], fast: bool, serialisation_format: t.Literal["safetensors", "legacy"], cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any, - ) -> LLMConfig | subprocess.Popen[bytes]: + def start_cmd( + ctx: click.Context, + /, + server_timeout: int, + model_id: str | None, + model_version: str | None, + workers_per_resource: t.Literal["conserved", "round_robin"] | LiteralString, + device: t.Tuple[str, ...], + quantize: t.Literal["int8", "int4", "gptq"] | None, + bettertransformer: bool | None, + runtime: t.Literal["ggml", "transformers"], + fast: bool, + serialisation_format: t.Literal["safetensors", "legacy"], + cors: bool, + adapter_id: str | None, + return_process: bool, + **attrs: t.Any, + ) -> LLMConfig | subprocess.Popen[bytes]: fast = str(fast).upper() in openllm.utils.ENV_VARS_TRUE_VALUES if serialisation_format == "safetensors" and quantize is not None and os.environ.get("OPENLLM_SERIALIZATION_WARNING", str(True)).upper() in openllm.utils.ENV_VARS_TRUE_VALUES: - termui.echo(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. 
To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg="yellow") + termui.echo( + f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", + fg="yellow" + ) adapter_map: dict[str, str | None] | None = attrs.pop(_adapter_mapping_key, None) config, server_attrs = llm_config.model_validate_click(**attrs) server_timeout = openllm.utils.first_not_none(server_timeout, default=config["timeout"]) @@ -117,21 +143,34 @@ Available official model_id(s): [default: {llm_config['default_id']}] wpr = float(wpr) # Create a new model env to work with the envvar during CLI invocation - env = openllm.utils.EnvVarMixin(config["model_name"], config.default_implementation(), model_id=model_id or config["default_id"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime) + env = openllm.utils.EnvVarMixin( + config["model_name"], config.default_implementation(), model_id=model_id or config["default_id"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime + ) prerequisite_check(ctx, config, quantize, adapter_map, int(1 / wpr)) # NOTE: This is to set current configuration start_env = os.environ.copy() start_env = parse_config_options(config, server_timeout, wpr, device, cors, start_env) - if fast: termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow") + if fast: + termui.echo(f"Fast mode is enabled. Make sure the model is available in local store before 'start': 'openllm import {model}{' --model-id ' + model_id if model_id else ''}'", fg="yellow") - start_env.update({"OPENLLM_MODEL": model, "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()), "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), "OPENLLM_SERIALIZATION": serialisation_format, env.runtime: env["runtime_value"], env.framework: env["framework_value"]}) + start_env.update({ + "OPENLLM_MODEL": model, + "BENTOML_DEBUG": str(openllm.utils.get_debug_mode()), + "BENTOML_HOME": os.environ.get("BENTOML_HOME", BentoMLContainer.bentoml_home.get()), + "OPENLLM_ADAPTER_MAP": orjson.dumps(adapter_map).decode(), + "OPENLLM_SERIALIZATION": serialisation_format, + env.runtime: env["runtime_value"], + env.framework: env["framework_value"] + }) if env["model_id_value"]: start_env[env.model_id] = str(env["model_id_value"]) # NOTE: quantize and bettertransformer value is already assigned within env if bettertransformer is not None: start_env[env.bettertransformer] = str(env["bettertransformer_value"]) if quantize is not None: start_env[env.quantize] = str(t.cast(str, env["quantize_value"])) - llm = openllm.utils.infer_auto_class(env["framework_value"]).for_model(model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format) + llm = openllm.utils.infer_auto_class(env["framework_value"]).for_model( + model, model_id=start_env[env.model_id], model_version=model_version, llm_config=config, ensure_available=not fast, adapter_map=adapter_map, serialisation=serialisation_format + ) 
start_env.update({env.config: llm.config.model_dump_json().decode()}) server = bentoml.GrpcServer("_service:svc", **server_attrs) if _serve_grpc else bentoml.HTTPServer("_service:svc", **server_attrs) @@ -174,7 +213,8 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, * return noop def prerequisite_check(ctx: click.Context, llm_config: LLMConfig, quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, num_workers: int) -> None: if adapter_map and not openllm.utils.is_peft_available(): ctx.fail("Using adapter requires 'peft' to be available. Make sure to install with 'pip install \"openllm[fine-tune]\"'") - if quantize and llm_config.default_implementation() == "vllm": ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.") + if quantize and llm_config.default_implementation() == "vllm": + ctx.fail(f"Quantization is not yet supported with vLLM. Set '{llm_config['env']['framework']}=\"pt\"' to run with quantization.") requirements = llm_config["requirements"] if requirements is not None and len(requirements) > 0: missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None] @@ -204,12 +244,22 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml) """, ), - cog.optgroup.option("--device", type=openllm.utils.dantic.CUDA, multiple=True, envvar="CUDA_VISIBLE_DEVICES", callback=parse_device_callback, help=f"Assign GPU devices (if available) for {llm_config['model_name']}.", show_envvar=True), + cog.optgroup.option( + "--device", + type=openllm.utils.dantic.CUDA, + multiple=True, + envvar="CUDA_VISIBLE_DEVICES", + callback=parse_device_callback, + help=f"Assign GPU devices (if available) for {llm_config['model_name']}.", + show_envvar=True + ), cog.optgroup.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. 
Default is transformers."), quantize_option(factory=cog.optgroup, model_env=llm_config["env"]), bettertransformer_option(factory=cog.optgroup, model_env=llm_config["env"]), serialisation_option(factory=cog.optgroup), - cog.optgroup.group("Fine-tuning related options", help="""\ + cog.optgroup.group( + "Fine-tuning related options", + help="""\ Note that the argument `--adapter-id` can accept the following format: - `--adapter-id /path/to/adapter` (local adapter) @@ -223,8 +273,16 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab $ openllm start opt --adapter-id /path/to/adapter_dir --adapter-id remote/adapter:eng_lora ``` - """), - cog.optgroup.option("--adapter-id", default=None, help="Optional name or path for given LoRA adapter" + f" to wrap '{llm_config['model_name']}'", multiple=True, callback=_id_callback, metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]"), + """ + ), + cog.optgroup.option( + "--adapter-id", + default=None, + help="Optional name or path for given LoRA adapter" + f" to wrap '{llm_config['model_name']}'", + multiple=True, + callback=_id_callback, + metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]" + ), click.option("--return-process", is_flag=True, default=False, help="Internal use only.", hidden=True), ) return composed(fn) @@ -246,7 +304,9 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig] from bentoml_cli.cli import cli command = "serve" if not serve_grpc else "serve-grpc" - group = cog.optgroup.group(f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]",) + group = cog.optgroup.group( + f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]", + ) def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]: serve_command = cli.commands[command] @@ -291,18 +351,46 @@ def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]: return [CompletionItem(it) for it in output] - return cli_option("-o", "--output", "output", type=click.Choice(output), default=default_value, help="Showing output type.", show_default=True, envvar="OPENLLM_OUTPUT", show_envvar=True, shell_complete=complete_output_var, **attrs)(f) + return cli_option( + "-o", + "--output", + "output", + type=click.Choice(output), + default=default_value, + help="Showing output type.", + show_default=True, + envvar="OPENLLM_OUTPUT", + show_envvar=True, + shell_complete=complete_output_var, + **attrs + )(f) def fast_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option("--fast/--no-fast", show_default=True, default=False, envvar="OPENLLM_USE_LOCAL_LATEST", show_envvar=True, help="""Whether to skip checking if models is already in store. + return cli_option( + "--fast/--no-fast", + show_default=True, + default=False, + envvar="OPENLLM_USE_LOCAL_LATEST", + show_envvar=True, + help="""Whether to skip checking if models is already in store. This is useful if you already downloaded or setup the model beforehand. 
- """, **attrs)(f) + """, + **attrs + )(f) def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--cors/--no-cors", show_default=True, default=False, envvar="OPENLLM_CORS", show_envvar=True, help="Enable CORS for the server.", **attrs)(f) def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)(f) def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option("--model-id", type=click.STRING, default=None, envvar=model_env.model_id if model_env is not None else None, show_envvar=model_env is not None, help="Optional model_id name or path for (fine-tune) weight.", **attrs)(f) + return cli_option( + "--model-id", + type=click.STRING, + default=None, + envvar=model_env.model_id if model_env is not None else None, + show_envvar=model_env is not None, + help="Optional model_id name or path for (fine-tune) weight.", + **attrs + )(f) def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option("--model-version", type=click.STRING, default=None, help="Optional model version to save for this model. It will be inferred automatically from model-id.", **attrs)(f) def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: @@ -349,14 +437,25 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = - ``round_robin``: Similar behaviour when setting ``--workers-per-resource 1``. This is useful for smaller models. - ``conserved``: This will determine the number of available GPU resources, and only assign one worker for the LLMRunner. For example, if ther are 4 GPUs available, then ``conserved`` is equivalent to ``--workers-per-resource 0.25``. - """ + ("""\n + """ + ( + """\n > [!NOTE] The workers value passed into 'build' will determine how the LLM can > be provisioned in Kubernetes as well as in standalone container. This will - > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else ""), + > ensure it has the same effect with 'openllm start --api-workers ...'""" if build else "" + ), **attrs )(f) def bettertransformer_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option("--bettertransformer", is_flag=True, default=None, envvar=model_env.bettertransformer if model_env is not None else None, show_envvar=model_env is not None, help="Apply FasterTransformer wrapper to serve model. This will applies during serving time." if not build else "Set default environment variable whether to serve this model with FasterTransformer in build time.", **attrs)(f) + return cli_option( + "--bettertransformer", + is_flag=True, + default=None, + envvar=model_env.bettertransformer if model_env is not None else None, + show_envvar=model_env is not None, + help="Apply FasterTransformer wrapper to serve model. This will applies during serving time." 
+ if not build else "Set default environment variable whether to serve this model with FasterTransformer in build time.", + **attrs + )(f) def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option( "--serialisation", diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index 5f38ca18..571a02bd 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -71,7 +71,14 @@ def _start( """ from .entrypoint import start_command, start_grpc_command llm_config = openllm.AutoConfig.for_model(model_name) - _ModelEnv = openllm_core.utils.EnvVarMixin(model_name, openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime) + _ModelEnv = openllm_core.utils.EnvVarMixin( + model_name, + openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()), + model_id=model_id, + bettertransformer=bettertransformer, + quantize=quantize, + runtime=runtime + ) os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"] args: list[str] = ["--runtime", runtime] @@ -87,7 +94,9 @@ def _start( if additional_args: args.extend(additional_args) if __test__: args.append("--return-process") - return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False) + return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=_serve_grpc).main( + args=args if len(args) > 0 else None, standalone_mode=False + ) @inject def _build( model_name: str, @@ -190,9 +199,21 @@ def _build( if e.stderr: raise OpenLLMException(e.stderr.decode("utf-8")) from None raise OpenLLMException(str(e)) from None matched = re.match(r"__tag__:([^:\n]+:[^:\n]+)$", output.decode("utf-8").strip()) - if matched is None: raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.") + if matched is None: + raise ValueError(f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub.") return bentoml.get(matched.group(1), _bento_store=bento_store) -def _import_model(model_name: str, /, *, model_id: str | None = None, model_version: str | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", implementation: LiteralRuntime = "pt", quantize: t.Literal["int8", "int4", "gptq"] | None = None, serialisation_format: t.Literal["legacy", "safetensors"] = "safetensors", additional_args: t.Sequence[str] | None = None) -> bentoml.Model: +def _import_model( + model_name: str, + /, + *, + model_id: str | None = None, + model_version: str | None = None, + runtime: t.Literal["ggml", "transformers"] = "transformers", + implementation: LiteralRuntime = "pt", + quantize: t.Literal["int8", "int4", "gptq"] | None = None, + serialisation_format: t.Literal["legacy", "safetensors"] = "safetensors", + additional_args: t.Sequence[str] | None = None +) -> bentoml.Model: """Import a LLM into local store. 
> [!NOTE] diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py index ff8adedd..bea867b1 100644 --- a/openllm-python/src/openllm/cli/entrypoint.py +++ b/openllm-python/src/openllm/cli/entrypoint.py @@ -225,7 +225,9 @@ class OpenLLMCommandGroup(BentoMLCommandGroup): with formatter.section(_("Extensions")): formatter.write_dl(rows) @click.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="openllm") -@click.version_option(None, "--version", "-v", message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}") +@click.version_option( + None, "--version", "-v", message=f"%(prog)s, %(version)s (compiled: {'yes' if openllm.COMPILED else 'no'})\nPython ({platform.python_implementation()}) {platform.python_version()}" +) def cli() -> None: """\b ██████╗ ██████╗ ███████╗███╗ ██╗██╗ ██╗ ███╗ ███╗ @@ -257,7 +259,14 @@ def start_grpc_command() -> None: $ openllm start-grpc -- ... ``` """ -_start_mapping = {"start": {key: start_command_factory(start_command, key, _context_settings=termui.CONTEXT_SETTINGS) for key in CONFIG_MAPPING}, "start-grpc": {key: start_command_factory(start_grpc_command, key, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=True) for key in CONFIG_MAPPING}} +_start_mapping = { + "start": { + key: start_command_factory(start_command, key, _context_settings=termui.CONTEXT_SETTINGS) for key in CONFIG_MAPPING + }, + "start-grpc": { + key: start_command_factory(start_grpc_command, key, _context_settings=termui.CONTEXT_SETTINGS, _serve_grpc=True) for key in CONFIG_MAPPING + } +} @cli.command(name="import", aliases=["download"]) @model_name_argument @click.argument("model_id", type=click.STRING, default=None, metavar="Optional[REMOTE_REPO/MODEL_ID | /path/to/local/model]", required=False) @@ -269,7 +278,18 @@ _start_mapping = {"start": {key: start_command_factory(start_command, key, _cont @machine_option @click.option("--implementation", type=click.Choice(["pt", "tf", "flax", "vllm"]), default=None, help="The implementation for saving this LLM.") @serialisation_option -def import_command(model_name: str, model_id: str | None, converter: str | None, model_version: str | None, output: LiteralOutput, runtime: t.Literal["ggml", "transformers"], machine: bool, implementation: LiteralRuntime | None, quantize: t.Literal["int8", "int4", "gptq"] | None, serialisation_format: t.Literal["safetensors", "legacy"],) -> bentoml.Model: +def import_command( + model_name: str, + model_id: str | None, + converter: str | None, + model_version: str | None, + output: LiteralOutput, + runtime: t.Literal["ggml", "transformers"], + machine: bool, + implementation: LiteralRuntime | None, + quantize: t.Literal["int8", "int4", "gptq"] | None, + serialisation_format: t.Literal["safetensors", "legacy"], +) -> bentoml.Model: """Setup LLM interactively. It accepts two positional arguments: `model_name` and `model_id`. 
The first name determine @@ -325,7 +345,9 @@ def import_command(model_name: str, model_id: str | None, converter: str | None, llm_config = AutoConfig.for_model(model_name) env = EnvVarMixin(model_name, llm_config.default_implementation(), model_id=model_id, runtime=runtime, quantize=quantize) impl: LiteralRuntime = first_not_none(implementation, default=env["framework_value"]) - llm = infer_auto_class(impl).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format) + llm = infer_auto_class(impl).for_model( + model_name, model_id=env["model_id_value"], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format + ) _previously_saved = False try: _ref = serialisation.get(llm) @@ -356,17 +378,37 @@ def import_command(model_name: str, model_id: str | None, converter: str | None, @quantize_option(factory=cog.optgroup, build=True) @bettertransformer_option(factory=cog.optgroup) @click.option("--runtime", type=click.Choice(["ggml", "transformers"]), default="transformers", help="The runtime to use for the given model. Default is transformers.") -@click.option("--enable-features", multiple=True, nargs=1, metavar="FEATURE[,FEATURE]", help="Enable additional features for building this LLM Bento. Available: {}".format(", ".join(OPTIONAL_DEPENDENCIES))) -@click.option("--adapter-id", default=None, multiple=True, metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]", help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed.") +@click.option( + "--enable-features", + multiple=True, + nargs=1, + metavar="FEATURE[,FEATURE]", + help="Enable additional features for building this LLM Bento. Available: {}".format(", ".join(OPTIONAL_DEPENDENCIES)) +) +@click.option( + "--adapter-id", + default=None, + multiple=True, + metavar="[PATH | [remote/][adapter_name:]adapter_id][, ...]", + help="Optional adapters id to be included within the Bento. Note that if you are using relative path, '--build-ctx' must be passed." +) @click.option("--build-ctx", help="Build context. This is required if --adapter-id uses relative path", default=None) @model_version_option @click.option("--dockerfile-template", default=None, type=click.File(), help="Optional custom dockerfile template to be used with this BentoLLM.") @serialisation_option @container_registry_option -@click.option("--container-version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="release", help="Default container version strategy for the image from '--container-registry'") +@click.option( + "--container-version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="release", help="Default container version strategy for the image from '--container-registry'" +) @fast_option @cog.optgroup.group(cls=cog.MutuallyExclusiveOptionGroup, name="Utilities options") -@cog.optgroup.option("--containerize", default=False, is_flag=True, type=click.BOOL, help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'.") +@cog.optgroup.option( + "--containerize", + default=False, + is_flag=True, + type=click.BOOL, + help="Whether to containerize the Bento after building. '--containerize' is the shortcut of 'openllm build && bentoml containerize'." 
+) @cog.optgroup.option("--push", default=False, is_flag=True, type=click.BOOL, help="Whether to push the result bento to BentoCloud. Make sure to login with 'bentoml cloud login' first.") @click.option("--force-push", default=False, is_flag=True, type=click.BOOL, help="Whether to force push.") @click.pass_context @@ -431,7 +473,9 @@ def build_command( if env["quantize_value"]: os.environ[env.quantize] = str(env["quantize_value"]) os.environ[env.bettertransformer] = str(env["bettertransformer_value"]) - llm = infer_auto_class(env["framework_value"]).for_model(model_name, model_id=env["model_id_value"], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs) + llm = infer_auto_class(env["framework_value"]).for_model( + model_name, model_id=env["model_id_value"], llm_config=llm_config, ensure_available=not fast, model_version=model_version, serialisation=serialisation_format, **attrs + ) labels = dict(llm.identifying_params) labels.update({"_type": llm.llm_type, "_framework": env["framework_value"]}) @@ -476,7 +520,20 @@ def build_command( raise bentoml.exceptions.NotFound(f"Rebuilding existing Bento {bento_tag}") from None _previously_built = True except bentoml.exceptions.NotFound: - bento = bundle.create_bento(bento_tag, llm_fs, llm, workers_per_resource=workers_per_resource, adapter_map=adapter_map, quantize=quantize, bettertransformer=bettertransformer, extra_dependencies=enable_features, dockerfile_template=dockerfile_template_path, runtime=runtime, container_registry=container_registry, container_version_strategy=container_version_strategy) + bento = bundle.create_bento( + bento_tag, + llm_fs, + llm, + workers_per_resource=workers_per_resource, + adapter_map=adapter_map, + quantize=quantize, + bettertransformer=bettertransformer, + extra_dependencies=enable_features, + dockerfile_template=dockerfile_template_path, + runtime=runtime, + container_registry=container_registry, + container_version_strategy=container_version_strategy + ) except Exception as err: raise err from None @@ -486,7 +543,12 @@ def build_command( termui.echo("\n" + OPENLLM_FIGLET, fg="white") if not _previously_built: termui.echo(f"Successfully built {bento}.", fg="green") elif not overwrite: termui.echo(f"'{model_name}' already has a Bento built [{bento}]. 
To overwrite it pass '--overwrite'.", fg="yellow") - termui.echo("📖 Next steps:\n\n" + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" + f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" + "\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n", fg="blue",) + termui.echo( + "📖 Next steps:\n\n" + f"* Push to BentoCloud with 'bentoml push':\n\t$ bentoml push {bento.tag}\n\n" + + f"* Containerize your Bento with 'bentoml containerize':\n\t$ bentoml containerize {bento.tag} --opt progress=plain\n\n" + + "\tTip: To enable additional BentoML features for 'containerize', use '--enable-features=FEATURE[,FEATURE]' [see 'bentoml containerize -h' for more advanced usage]\n", + fg="blue", + ) elif output == "json": termui.echo(orjson.dumps(bento.info.to_dict(), option=orjson.OPT_INDENT_2).decode()) else: @@ -538,7 +600,14 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo if config["model_name"] in MODEL_FLAX_MAPPING_NAMES: runtime_impl += ("flax",) if config["model_name"] in MODEL_TF_MAPPING_NAMES: runtime_impl += ("tf",) if config["model_name"] in MODEL_VLLM_MAPPING_NAMES: runtime_impl += ("vllm",) - json_data[m] = {"architecture": config["architecture"], "model_id": config["model_ids"], "cpu": not config["requires_gpu"], "gpu": True, "runtime_impl": runtime_impl, "installation": f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config["requirements"] else "openllm",} + json_data[m] = { + "architecture": config["architecture"], + "model_id": config["model_ids"], + "cpu": not config["requires_gpu"], + "gpu": True, + "runtime_impl": runtime_impl, + "installation": f'"openllm[{m}]"' if m in OPTIONAL_DEPENDENCIES or config["requirements"] else "openllm", + } converted.extend([normalise_model_name(i) for i in config["model_ids"]]) if DEBUG: try: @@ -546,7 +615,11 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo except Exception as e: failed_initialized.append((m, e)) - ids_in_local_store = {k: [i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k] for k in json_data.keys()} + ids_in_local_store = { + k: [ + i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k + ] for k in json_data.keys() + } ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v} local_models: DictStrAny | None = None if show_available: @@ -563,7 +636,9 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo data: list[str | tuple[str, str, list[str], str, LiteralString, LiteralString, tuple[LiteralRuntime, ...]]] = [] for m, v in json_data.items(): data.extend([(m, v["architecture"], v["model_id"], v["installation"], "❌" if not v["cpu"] else "✅", "✅", v["runtime_impl"],)]) - column_widths = [int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4),] + column_widths = [ + int(termui.COLUMNS / 12), int(termui.COLUMNS / 6), int(termui.COLUMNS / 4), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 12), int(termui.COLUMNS / 4), + 
] if len(data) == 0 and len(failed_initialized) > 0: termui.echo("Exception found while parsing models:\n", fg="yellow") @@ -596,17 +671,22 @@ def models_command(ctx: click.Context, output: LiteralOutput, show_available: bo @click.option("-y", "--yes", "--assume-yes", is_flag=True, help="Skip confirmation when deleting a specific model") @click.option("--include-bentos/--no-include-bentos", is_flag=True, default=False, help="Whether to also include pruning bentos.") @inject -def prune_command(model_name: str | None, yes: bool, include_bentos: bool, model_store: ModelStore = Provide[BentoMLContainer.model_store], bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> None: +def prune_command( + model_name: str | None, yes: bool, include_bentos: bool, model_store: ModelStore = Provide[BentoMLContainer.model_store], bento_store: BentoStore = Provide[BentoMLContainer.bento_store] +) -> None: """Remove all saved models, (and optionally bentos) built with OpenLLM locally. \b If a model type is passed, then only prune models for that given model type. """ - available: list[tuple[bentoml.Model | bentoml.Bento, ModelStore | BentoStore]] = [(m, model_store) for m in bentoml.models.list() if "framework" in m.info.labels and m.info.labels["framework"] == "openllm"] + available: list[tuple[bentoml.Model | bentoml.Bento, + ModelStore | BentoStore]] = [(m, model_store) for m in bentoml.models.list() if "framework" in m.info.labels and m.info.labels["framework"] == "openllm"] if model_name is not None: available = [(m, store) for m, store in available if "model_name" in m.info.labels and m.info.labels["model_name"] == inflection.underscore(model_name)] if include_bentos: - if model_name is not None: available += [(b, bento_store) for b in bentoml.bentos.list() if "start_name" in b.info.labels and b.info.labels["start_name"] == inflection.underscore(model_name)] - else: available += [(b, bento_store) for b in bentoml.bentos.list() if "_type" in b.info.labels and "_framework" in b.info.labels] + if model_name is not None: + available += [(b, bento_store) for b in bentoml.bentos.list() if "start_name" in b.info.labels and b.info.labels["start_name"] == inflection.underscore(model_name)] + else: + available += [(b, bento_store) for b in bentoml.bentos.list() if "_type" in b.info.labels and "_framework" in b.info.labels] for store_item, store in available: if yes: delete_confirmed = True @@ -633,15 +713,27 @@ def parsing_instruction_callback(ctx: click.Context, param: click.Parameter, val else: raise click.BadParameter(f"Invalid option format: {value}") def shared_client_options(f: _AnyCallable | None = None, output_value: t.Literal["json", "porcelain", "pretty"] = "pretty") -> t.Callable[[FC], FC]: - options = [click.option("--endpoint", type=click.STRING, help="OpenLLM Server endpoint, i.e: http://localhost:3000", envvar="OPENLLM_ENDPOINT", default="http://localhost:3000",), click.option("--timeout", type=click.INT, default=30, help="Default server timeout", show_default=True), output_option(default_value=output_value),] + options = [ + click.option("--endpoint", type=click.STRING, help="OpenLLM Server endpoint, i.e: http://localhost:3000", envvar="OPENLLM_ENDPOINT", default="http://localhost:3000", + ), + click.option("--timeout", type=click.INT, default=30, help="Default server timeout", show_default=True), + output_option(default_value=output_value), + ] return compose(*options)(f) if f is not None else compose(*options) @cli.command() @click.argument("task", type=click.STRING, 
metavar="TASK") @shared_client_options @click.option("--agent", type=click.Choice(["hf"]), default="hf", help="Whether to interact with Agents from given Server endpoint.", show_default=True) @click.option("--remote", is_flag=True, default=False, help="Whether or not to use remote tools (inference endpoints) instead of local ones.", show_default=True) -@click.option("--opt", help="Define prompt options. " - "(format: ``--opt text='I love this' --opt audio:./path/to/audio --opt image:/path/to/file``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]") +@click.option( + "--opt", + help="Define prompt options. " + "(format: ``--opt text='I love this' --opt audio:./path/to/audio --opt image:/path/to/file``)", + required=False, + multiple=True, + callback=opt_callback, + metavar="ARG=VALUE[,ARG=VALUE]" +) def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: LiteralOutput, remote: bool, task: str, _memoized: DictStrAny, **attrs: t.Any) -> str: """Instruct agents interactively for given tasks, from a terminal. @@ -675,7 +767,9 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output: @click.argument("text", type=click.STRING, nargs=-1) @machine_option @click.pass_context -def embed_command(ctx: click.Context, text: tuple[str, ...], endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, machine: bool) -> EmbeddingsOutput | None: +def embed_command( + ctx: click.Context, text: tuple[str, ...], endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, machine: bool +) -> EmbeddingsOutput | None: """Get embeddings interactively, from a terminal. \b @@ -703,9 +797,13 @@ def embed_command(ctx: click.Context, text: tuple[str, ...], endpoint: str, time @shared_client_options @click.option("--server-type", type=click.Choice(["grpc", "http"]), help="Server type", default="http", show_default=True) @click.argument("prompt", type=click.STRING) -@click.option("--sampling-params", help="Define query options. (format: ``--opt temperature=0.8 --opt=top_k:12)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]") +@click.option( + "--sampling-params", help="Define query options. (format: ``--opt temperature=0.8 --opt=top_k:12)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]" +) @click.pass_context -def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any) -> None: +def query_command( + ctx: click.Context, /, prompt: str, endpoint: str, timeout: int, server_type: t.Literal["http", "grpc"], output: LiteralOutput, _memoized: DictStrAny, **attrs: t.Any +) -> None: """Ask a LLM interactively, from a terminal. \b diff --git a/openllm-python/src/openllm/cli/extension/get_containerfile.py b/openllm-python/src/openllm/cli/extension/get_containerfile.py index 5f6e3f15..413864d6 100644 --- a/openllm-python/src/openllm/cli/extension/get_containerfile.py +++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py @@ -31,5 +31,6 @@ def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[Bento # save it to /env/docker/Dockerfile.template. This is necessary # for the reconstruction of the Dockerfile. 
if "dockerfile_template" in docker_attrs and docker_attrs["dockerfile_template"] is not None: docker_attrs["dockerfile_template"] = "env/docker/Dockerfile.template" - termui.echo(generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True,), fg="white") + doc = generate_containerfile(docker=DockerOptions(**docker_attrs), build_ctx=bentomodel.path, conda=options.conda, bento_fs=bentomodel._fs, enable_buildkit=True, add_header=True) + termui.echo(doc, fg="white") return bentomodel.path diff --git a/openllm-python/src/openllm/cli/extension/get_prompt.py b/openllm-python/src/openllm/cli/extension/get_prompt.py index 4f372f07..99721a62 100644 --- a/openllm-python/src/openllm/cli/extension/get_prompt.py +++ b/openllm-python/src/openllm/cli/extension/get_prompt.py @@ -11,7 +11,14 @@ LiteralOutput = t.Literal["json", "pretty", "porcelain"] @output_option @click.option("--format", type=click.STRING, default=None) @machine_option -@click.option("--opt", help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]") +@click.option( + "--opt", + help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)", + required=False, + multiple=True, + callback=opt_callback, + metavar="ARG=VALUE[,ARG=VALUE]" +) @click.pass_context def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None: """Get the default prompt used by OpenLLM.""" diff --git a/openllm-python/src/openllm/cli/extension/list_bentos.py b/openllm-python/src/openllm/cli/extension/list_bentos.py index 8ab9f518..23838825 100644 --- a/openllm-python/src/openllm/cli/extension/list_bentos.py +++ b/openllm-python/src/openllm/cli/extension/list_bentos.py @@ -8,12 +8,25 @@ from openllm.cli._factory import LiteralOutput, output_option @click.pass_context def cli(ctx: click.Context, output: LiteralOutput) -> None: """List available bentos built by OpenLLM.""" - mapping = {k: [{"tag": str(b.tag), "size": human_readable_size(openllm.utils.calc_dir_size(b.path)), "models": [{"tag": str(m.tag), "size": human_readable_size(openllm.utils.calc_dir_size(m.path))} for m in (bentoml.models.get(_.tag) for _ in b.info.models)]} for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {"start_name", "bundler"})) if b.info.labels["start_name"] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys())} + mapping = { + k: [{ + "tag": str(b.tag), + "size": human_readable_size(openllm.utils.calc_dir_size(b.path)), + "models": [{ + "tag": str(m.tag), "size": human_readable_size(openllm.utils.calc_dir_size(m.path)) + } for m in (bentoml.models.get(_.tag) for _ in b.info.models)] + } for b in tuple(i for i in bentoml.list() if all(k in i.info.labels for k in {"start_name", "bundler"})) if b.info.labels["start_name"] == k] for k in tuple( + inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys() + ) + } mapping = {k: v for k, v in mapping.items() if v} if output == "pretty": import tabulate tabulate.PRESERVE_WHITESPACE = True - termui.echo(tabulate.tabulate([(k, i["tag"], i["size"], [_["tag"] for _ in i["models"]]) for k, v in mapping.items() for i in v], tablefmt="fancy_grid", headers=["LLM", "Tag", "Size", "Models"]), 
fg="white") + termui.echo( + tabulate.tabulate([(k, i["tag"], i["size"], [_["tag"] for _ in i["models"]]) for k, v in mapping.items() for i in v], tablefmt="fancy_grid", headers=["LLM", "Tag", "Size", "Models"]), + fg="white" + ) else: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg="white") ctx.exit(0) diff --git a/openllm-python/src/openllm/cli/extension/list_models.py b/openllm-python/src/openllm/cli/extension/list_models.py index 7aa1e5ff..5449dba9 100644 --- a/openllm-python/src/openllm/cli/extension/list_models.py +++ b/openllm-python/src/openllm/cli/extension/list_models.py @@ -11,8 +11,12 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny: """This is equivalent to openllm models --show-available less the nice table.""" models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) - ids_in_local_store = {k: [i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k] for k in models} - if model_name is not None: ids_in_local_store = {k: [i for i in v if "model_name" in i.info.labels and i.info.labels["model_name"] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()} + ids_in_local_store = { + k: [i for i in bentoml.models.list() if "framework" in i.info.labels and i.info.labels["framework"] == "openllm" and "model_name" in i.info.labels and i.info.labels["model_name"] == k + ] for k in models + } + if model_name is not None: + ids_in_local_store = {k: [i for i in v if "model_name" in i.info.labels and i.info.labels["model_name"] == inflection.dasherize(model_name)] for k, v in ids_in_local_store.items()} ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v} local_models = {k: [{"tag": str(i.tag), "size": human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items()} if output == "pretty": diff --git a/openllm-python/src/openllm/client.py b/openllm-python/src/openllm/client.py index 66ebdd54..d7ef5aaa 100644 --- a/openllm-python/src/openllm/client.py +++ b/openllm-python/src/openllm/client.py @@ -12,7 +12,7 @@ client.embed("What is the difference between gather and scatter?") """ from __future__ import annotations import openllm_client, typing as t -if t.TYPE_CHECKING: from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient +if t.TYPE_CHECKING: from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient def __dir__() -> t.Sequence[str]: return sorted(dir(openllm_client)) def __getattr__(it: str) -> t.Any: diff --git a/openllm-python/src/openllm/models/auto/__init__.py b/openllm-python/src/openllm/models/auto/__init__.py index 34d2b858..12dfd912 100644 --- a/openllm-python/src/openllm/models/auto/__init__.py +++ b/openllm-python/src/openllm/models/auto/__init__.py @@ -3,7 +3,12 @@ import typing as t, os import openllm from openllm_core.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, 
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES -_import_structure: dict[str, list[str]] = {"modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]} +_import_structure: dict[str, list[str]] = { + "modeling_auto": ["MODEL_MAPPING_NAMES"], + "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], + "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], + "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"] +} if t.TYPE_CHECKING: from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES from .modeling_flax_auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES diff --git a/openllm-python/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py index f6a64540..78861a74 100644 --- a/openllm-python/src/openllm/models/auto/factory.py +++ b/openllm-python/src/openllm/models/auto/factory.py @@ -23,7 +23,8 @@ class BaseAutoLLMClass: raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.") @classmethod - def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False, **attrs: t.Any) -> openllm.LLM[t.Any, t.Any]: + def for_model(cls, model: str, /, model_id: str | None = None, model_version: str | None = None, llm_config: openllm.LLMConfig | None = None, ensure_available: bool = False, + **attrs: t.Any) -> openllm.LLM[t.Any, t.Any]: """The lower level API for creating a LLM instance. ```python @@ -62,14 +63,18 @@ class BaseAutoLLMClass: llm_class: The runnable to register. """ if hasattr(llm_class, "config_class") and llm_class.config_class is not config_class: - raise ValueError(f"The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has {llm_class.config_class} and you passed {config_class}. Fix one of those so they match!") + raise ValueError( + f"The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has {llm_class.config_class} and you passed {config_class}. Fix one of those so they match!" + ) cls._model_mapping.register(config_class, llm_class) @classmethod def infer_class_from_name(cls, name: str) -> type[openllm.LLM[t.Any, t.Any]]: config_class = openllm.AutoConfig.infer_class_from_name(name) if config_class in cls._model_mapping: return cls._model_mapping[config_class] - raise ValueError(f"Unrecognized configuration class ({config_class}) for {name}. Model name should be one of {', '.join(openllm.CONFIG_MAPPING.keys())} (Registered configuration class: {', '.join([i.__name__ for i in cls._model_mapping.keys()])}).") + raise ValueError( + f"Unrecognized configuration class ({config_class}) for {name}. Model name should be one of {', '.join(openllm.CONFIG_MAPPING.keys())} (Registered configuration class: {', '.join([i.__name__ for i in cls._model_mapping.keys()])})." 
+ ) def getattribute_from_module(module: types.ModuleType, attr: t.Any) -> t.Any: if attr is None: return if isinstance(attr, tuple): return tuple(getattribute_from_module(module, a) for a in attr) @@ -127,13 +132,23 @@ class _LazyAutoMapping(OrderedDict, ReprMixin): return bool(self.keys()) def keys(self) -> ConfigModelKeysView: - return t.cast("ConfigModelKeysView", [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys())) + return t.cast( + "ConfigModelKeysView", [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + list(self._extra_content.keys()) + ) def values(self) -> ConfigModelValuesView: - return t.cast("ConfigModelValuesView", [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list(self._extra_content.values())) + return t.cast( + "ConfigModelValuesView", [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + list( + self._extra_content.values() + ) + ) def items(self) -> ConfigModelItemsView: - return t.cast("ConfigModelItemsView", [(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items())) + return t.cast( + "ConfigModelItemsView", + [(self._load_attr_from_module(key, self._config_mapping[key]), + self._load_attr_from_module(key, self._model_mapping[key])) for key in self._model_mapping.keys() if key in self._config_mapping.keys()] + list(self._extra_content.items()) + ) def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]: return iter(t.cast("SupportsIter[t.Iterator[type[openllm.LLMConfig]]]", self.keys())) diff --git a/openllm-python/src/openllm/models/auto/modeling_auto.py b/openllm-python/src/openllm/models/auto/modeling_auto.py index f6a12d3e..4ad482e3 100644 --- a/openllm-python/src/openllm/models/auto/modeling_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_auto.py @@ -3,7 +3,9 @@ import typing as t from collections import OrderedDict from .factory import BaseAutoLLMClass, _LazyAutoMapping from openllm_core.config import CONFIG_MAPPING_NAMES -MODEL_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLM"), ("dolly_v2", "DollyV2"), ("falcon", "Falcon"), ("flan_t5", "FlanT5"), ("gpt_neox", "GPTNeoX"), ("llama", "Llama"), ("mpt", "MPT"), ("opt", "OPT"), ("stablelm", "StableLM"), ("starcoder", "StarCoder"), ("baichuan", "Baichuan")]) +MODEL_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLM"), ("dolly_v2", "DollyV2"), ("falcon", "Falcon"), ("flan_t5", "FlanT5"), ("gpt_neox", "GPTNeoX"), ("llama", "Llama"), ("mpt", "MPT"), ( + "opt", "OPT" +), ("stablelm", "StableLM"), ("starcoder", "StarCoder"), ("baichuan", "Baichuan")]) MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) class AutoLLM(BaseAutoLLMClass): _model_mapping: t.ClassVar = MODEL_MAPPING diff --git a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py index 516e737b..855a1fb3 100644 --- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py +++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py @@ -3,7 +3,9 @@ import typing as t from collections import OrderedDict from .factory import 
BaseAutoLLMClass, _LazyAutoMapping from openllm_core.config import CONFIG_MAPPING_NAMES -MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")]) +MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ( + "opt", "VLLMOPT" +), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")]) MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES) class AutoVLLM(BaseAutoLLMClass): _model_mapping: t.ClassVar = MODEL_VLLM_MAPPING diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py index d63b59ba..e3894f63 100644 --- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -4,7 +4,7 @@ from openllm_core._typing_compat import overload from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf -else: torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow") +else: torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow") logger = logging.getLogger(__name__) @overload def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.PreTrainedTokenizer, _init: t.Literal[True] = True, **attrs: t.Any) -> transformers.Pipeline: @@ -52,7 +52,12 @@ def get_pipeline(model: transformers.PreTrainedModel, tokenizer: transformers.Pr input_ids, attention_mask = input_tensors["input_ids"], input_tensors.get("attention_mask", None) if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1 else: in_b = input_ids.shape[0] - generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None, attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None, pad_token_id=self.tokenizer.pad_token_id, **generate_kwargs) + generated_sequence = self.model.generate( + input_ids=input_ids.to(self.model.device) if input_ids is not None else None, + attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None, + pad_token_id=self.tokenizer.pad_token_id, + **generate_kwargs + ) out_b = generated_sequence.shape[0] if self.framework == "pt": generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:]) elif self.framework == "tf": generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])) diff --git a/openllm-python/src/openllm/models/falcon/modeling_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_falcon.py index 0d6faf0b..55d6e147 100644 --- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py +++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py @@ -12,7 +12,14 @@ class 
Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine def generate(self, prompt: str, **attrs: t.Any) -> list[str]: eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device) with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): # type: ignore[attr-defined] - return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()), skip_special_tokens=True) + return self.tokenizer.batch_decode( + self.model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config() + ), + skip_special_tokens=True + ) def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal["generated_text"], str]]: max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop("max_new_tokens", 200), self.tokenizer(prompt, return_tensors="pt").to(self.device) diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py index 6936fbed..ae24cd17 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -7,7 +7,10 @@ class FlanT5(openllm.LLM["transformers.T5ForConditionalGeneration", "transformer def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch with torch.inference_mode(): - return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) + return self.tokenizer.batch_decode( + self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), + skip_special_tokens=True + ) def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: import torch, torch.nn.functional as F diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index f6661c9e..c22881ce 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -6,11 +6,38 @@ if t.TYPE_CHECKING: import transformers class FlaxFlanT5(openllm.LLM["transformers.FlaxT5ForConditionalGeneration", "transformers.T5TokenizerFast"]): __openllm_internal__ = True - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, repetition_penalty: float | None = None, decoder_start_token_id: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + def sanitize_parameters( + self, + prompt: str, + max_new_tokens: int | None = None, + temperature: float | None = None, + top_k: int | None = None, + top_p: float | None = None, + repetition_penalty: float | None = None, + decoder_start_token_id: int | None = None, + use_default_prompt_template: bool = True, + **attrs: t.Any + ) -> 
tuple[str, dict[str, t.Any], dict[str, t.Any]]: if decoder_start_token_id is None: decoder_start_token_id = 0 - return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "top_p": top_p, "repetition_penalty": repetition_penalty, "decoder_start_token_id": decoder_start_token_id}, {} + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { + "max_new_tokens": max_new_tokens, + "temperature": temperature, + "top_k": top_k, + "top_p": top_p, + "repetition_penalty": repetition_penalty, + "decoder_start_token_id": decoder_start_token_id + }, {} def generate(self, prompt: str, **attrs: t.Any) -> list[str]: # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation. decoder_start_token_id = attrs.pop("decoder_start_token_id", 0) - return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="np")["input_ids"], do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), decoder_start_token_id=decoder_start_token_id).sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True) + return self.tokenizer.batch_decode( + self.model.generate( + self.tokenizer(prompt, return_tensors="np")["input_ids"], + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config(), + decoder_start_token_id=decoder_start_token_id + ).sequences, + skip_special_tokens=True, + clean_up_tokenization_spaces=True + ) diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index 43d656b5..ef91ee90 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -5,4 +5,7 @@ class TFFlanT5(openllm.LLM["transformers.TFT5ForConditionalGeneration", "transfo __openllm_internal__ = True def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) + return self.tokenizer.batch_decode( + self.model.generate(self.tokenizer(prompt, return_tensors="tf").input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), + skip_special_tokens=True + ) diff --git a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py index 03df3cc6..1fc72d3f 100644 --- a/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py +++ b/openllm-python/src/openllm/models/gpt_neox/modeling_gpt_neox.py @@ -20,4 +20,12 @@ class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNe def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch with torch.inference_mode(): - return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))) + return 
self.tokenizer.batch_decode( + self.model.generate( + self.tokenizer(prompt, return_tensors="pt").to(self.device).input_ids, + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config(), + pad_token_id=self.tokenizer.eos_token_id, + stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]) + ) + ) diff --git a/openllm-python/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py index 4c044035..927d1fc5 100644 --- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py +++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py @@ -4,12 +4,17 @@ from openllm.utils import generate_labels, is_triton_available if t.TYPE_CHECKING: import transformers, torch logger = logging.getLogger(__name__) -def get_mpt_config(model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True) -> transformers.PretrainedConfig: +def get_mpt_config( + model_id_or_path: str, max_sequence_length: int, device: torch.device | str | int | None, device_map: str | None = None, trust_remote_code: bool = True +) -> transformers.PretrainedConfig: import torch config = transformers.AutoConfig.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code) if hasattr(config, "init_device") and device_map is None and isinstance(device, (str, torch.device)): config.init_device = str(device) if hasattr(config, "attn_config") and is_triton_available(): config.attn_config["attn_impl"] = "triton" - else: logger.debug("'triton' is not available, Flash Attention will use the default Torch implementation. For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'") + else: + logger.debug( + "'triton' is not available, Flash Attention will use the default Torch implementation. 
For faster inference, make sure to install triton with 'pip install \"git+https://github.com/openai/triton.git#egg=triton&subdirectory=python\"'" + ) # setting max_seq_len config.max_seq_len = max_sequence_length return config @@ -46,7 +51,9 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken device_map = attrs.pop("device_map", None) trust_remote_code = attrs.pop("trust_remote_code", True) config = get_mpt_config(self._bentomodel.path, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code,) - model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs) + model = transformers.AutoModelForCausalLM.from_pretrained( + self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map=device_map, **attrs + ) model.tie_weights() return model @@ -54,7 +61,12 @@ class MPT(openllm.LLM["transformers.PreTrainedModel", "transformers.GPTNeoXToken import torch llm_config = self.config.model_construct_env(**attrs) inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device) - attrs = {"do_sample": False if llm_config["temperature"] == 0 else True, "eos_token_id": self.tokenizer.eos_token_id, "pad_token_id": self.tokenizer.pad_token_id, "generation_config": llm_config.to_generation_config()} + attrs = { + "do_sample": False if llm_config["temperature"] == 0 else True, + "eos_token_id": self.tokenizer.eos_token_id, + "pad_token_id": self.tokenizer.pad_token_id, + "generation_config": llm_config.to_generation_config() + } with torch.inference_mode(): if torch.cuda.is_available(): with torch.autocast("cuda", torch.float16): # type: ignore[attr-defined] diff --git a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py index 81c66d80..078fd574 100644 --- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py @@ -13,10 +13,27 @@ class FlaxOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Tok def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) tokenizer.pad_token_id = config.pad_token_id - return bentoml.transformers.save_model(self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) + return bentoml.transformers.save_model( + self.tag, transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self) + ) - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, repetition_penalty: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty}, {} + def 
sanitize_parameters( + self, + prompt: str, + max_new_tokens: int | None = None, + temperature: float | None = None, + top_k: int | None = None, + num_return_sequences: int | None = None, + repetition_penalty: float | None = None, + use_default_prompt_template: bool = False, + **attrs: t.Any + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { + "max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences, "repetition_penalty": repetition_penalty + }, {} def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True) + return self.tokenizer.batch_decode( + self.model.generate(**self.tokenizer(prompt, return_tensors="np"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, + skip_special_tokens=True + ) diff --git a/openllm-python/src/openllm/models/opt/modeling_opt.py b/openllm-python/src/openllm/models/opt/modeling_opt.py index 601d038c..ef7cb428 100644 --- a/openllm-python/src/openllm/models/opt/modeling_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_opt.py @@ -14,4 +14,7 @@ class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch with torch.inference_mode(): - return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) + return self.tokenizer.batch_decode( + self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), + skip_special_tokens=True + ) diff --git a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py index 0e66335e..4e239f75 100644 --- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py @@ -9,7 +9,15 @@ class TFOPT(openllm.LLM["transformers.TFOPTForCausalLM", "transformers.GPT2Token import transformers config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) tokenizer.pad_token_id = config.pad_token_id - return bentoml.transformers.save_model(self.tag, transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={"tokenizer": tokenizer}, labels=generate_labels(self)) + return bentoml.transformers.save_model( + self.tag, + transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), + custom_objects={"tokenizer": tokenizer}, + labels=generate_labels(self) + ) def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) + return 
self.tokenizer.batch_decode( + self.model.generate(**self.tokenizer(prompt, return_tensors="tf"), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config()), + skip_special_tokens=True + ) diff --git a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py index f3b78975..e5e7491b 100644 --- a/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_vllm_opt.py @@ -7,5 +7,16 @@ class VLLMOPT(openllm.LLM["vllm.LLMEngine", "transformers.GPT2Tokenizer"]): __openllm_internal__ = True tokenizer_id = "local" - def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {} + def sanitize_parameters( + self, + prompt: str, + max_new_tokens: int | None = None, + temperature: float | None = None, + top_k: int | None = None, + num_return_sequences: int | None = None, + use_default_prompt_template: bool = True, + **attrs: t.Any + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), { + "max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences + }, {} diff --git a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py index 3e7f8e13..2b5b0e36 100644 --- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py +++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py @@ -16,4 +16,15 @@ class StableLM(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTN def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch with torch.inference_mode(): - return [self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), do_sample=True, generation_config=self.config.model_construct_env(**attrs).to_generation_config(), pad_token_id=self.tokenizer.eos_token_id, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0], skip_special_tokens=True)] + return [ + self.tokenizer.decode( + self.model.generate( + **self.tokenizer(prompt, return_tensors="pt").to(self.device), + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config(), + pad_token_id=self.tokenizer.eos_token_id, + stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]) + )[0], + skip_special_tokens=True + ) + ] diff --git a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py index db789d15..f48ccb1f 100644 --- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py +++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py @@ -27,7 +27,12 @@ class StarCoder(openllm.LLM["transformers.GPTBigCodeForCausalLM", "transformers. 
with torch.inference_mode(): # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder # NOTE: support fine-tuning starcoder - result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors="pt").to(self.device), do_sample=True, pad_token_id=self.tokenizer.eos_token_id, generation_config=self.config.model_construct_env(**attrs).to_generation_config()) + result_tensor = self.model.generate( + self.tokenizer.encode(prompt, return_tensors="pt").to(self.device), + do_sample=True, + pad_token_id=self.tokenizer.eos_token_id, + generation_config=self.config.model_construct_env(**attrs).to_generation_config() + ) # TODO: We will probably want to return the tokenizer here so that we can manually process this # return (skip_special_tokens=False, clean_up_tokenization_spaces=False)) return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True) diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index 9f3ef617..d00f5a44 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -47,9 +47,11 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: try: tokenizer = cloudpickle.load(t.cast("t.IO[bytes]", cofile))["tokenizer"] except KeyError: - raise openllm.exceptions.OpenLLMException("Bento model does not have tokenizer. Make sure to save" - " the tokenizer within the model via 'custom_objects'." - " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"") from None + raise openllm.exceptions.OpenLLMException( + "Bento model does not have tokenizer. Make sure to save" + " the tokenizer within the model via 'custom_objects'." 
+ " For example: \"bentoml.transformers.save_model(..., custom_objects={'tokenizer': tokenizer})\"" + ) from None else: tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(bentomodel_fs.getsyspath("/"), trust_remote_code=llm.__llm_trust_remote_code__, **tokenizer_attrs) diff --git a/openllm-python/src/openllm/serialisation/constants.py b/openllm-python/src/openllm/serialisation/constants.py index 4537d9a8..8b0a166c 100644 --- a/openllm-python/src/openllm/serialisation/constants.py +++ b/openllm-python/src/openllm/serialisation/constants.py @@ -1,3 +1,8 @@ from __future__ import annotations -FRAMEWORK_TO_AUTOCLASS_MAPPING = {"pt": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"), "tf": ("TFAutoModelForCausalLM", "TFAutoModelForSeq2SeqLM"), "flax": ("FlaxAutoModelForCausalLM", "FlaxAutoModelForSeq2SeqLM"), "vllm": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM")} +FRAMEWORK_TO_AUTOCLASS_MAPPING = { + "pt": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM"), + "tf": ("TFAutoModelForCausalLM", "TFAutoModelForSeq2SeqLM"), + "flax": ("FlaxAutoModelForCausalLM", "FlaxAutoModelForSeq2SeqLM"), + "vllm": ("AutoModelForCausalLM", "AutoModelForSeq2SeqLM") +} HUB_ATTRS = ["cache_dir", "code_revision", "force_download", "local_files_only", "proxies", "resume_download", "revision", "subfolder", "use_auth_token"] diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index c95ef99f..8bfa9b32 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -55,7 +55,8 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, signatures: DictStrAny = {} if quantize_method == "gptq": - if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") + if not openllm.utils.is_autogptq_available(): + raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). 
Install it with 'pip install \"openllm[gptq]\"'") if llm.config["model_type"] != "causal_lm": raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") signatures["generate"] = {"batchable": False} else: @@ -70,16 +71,33 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool, external_modules: list[types.ModuleType] = [importlib.import_module(tokenizer.__module__)] imported_modules: list[types.ModuleType] = [] - bentomodel = bentoml.Model.create(llm.tag, module="openllm.serialisation.transformers", api_version="v1", options=ModelOptions(), context=openllm.utils.generate_context(framework_name="openllm"), labels=openllm.utils.generate_labels(llm), signatures=signatures if signatures else make_model_signatures(llm)) + bentomodel = bentoml.Model.create( + llm.tag, + module="openllm.serialisation.transformers", + api_version="v1", + options=ModelOptions(), + context=openllm.utils.generate_context(framework_name="openllm"), + labels=openllm.utils.generate_labels(llm), + signatures=signatures if signatures else make_model_signatures(llm) + ) with openllm.utils.analytics.set_bentoml_tracking(): try: bentomodel.enter_cloudpickle_context(external_modules, imported_modules) tokenizer.save_pretrained(bentomodel.path) if quantize_method == "gptq": - if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") + if not openllm.utils.is_autogptq_available(): + raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") if llm.config["model_type"] != "causal_lm": raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") logger.debug("Saving model with GPTQ quantisation will require loading model into memory.") - model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id, *decls, quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), trust_remote_code=trust_remote_code, use_safetensors=safe_serialisation, **hub_attrs, **attrs,) + model = autogptq.AutoGPTQForCausalLM.from_quantized( + llm.model_id, + *decls, + quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), + trust_remote_code=trust_remote_code, + use_safetensors=safe_serialisation, + **hub_attrs, + **attrs, + ) update_model(bentomodel, metadata={"_pretrained_class": model.__class__.__name__, "_framework": model.model.framework}) model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation) else: @@ -120,8 +138,10 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: """ try: model = bentoml.models.get(llm.tag) - if model.info.module not in ("openllm.serialisation.transformers" - "bentoml.transformers", "bentoml._internal.frameworks.transformers", __name__): # NOTE: backward compatible with previous version of OpenLLM. + if model.info.module not in ( + "openllm.serialisation.transformers" + "bentoml.transformers", "bentoml._internal.frameworks.transformers", __name__ + ): # NOTE: backward compatible with previous version of OpenLLM. 
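# [Editorial sketch] What the compatibility check above inspects on a locally stored model,
# shown against a hypothetical tag (not a tag that necessarily exists in your model store):
import bentoml

stored = bentoml.models.get("opt:latest")   # hypothetical tag
print(stored.info.module)                   # e.g. "openllm.serialisation.transformers"
print(stored.info.labels.get("runtime"), stored.info.labels.get("framework"))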
raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.") if "runtime" in model.info.labels and model.info.labels["runtime"] != llm.runtime: raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.") @@ -136,26 +156,51 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: If model is not found, it will raises a ``bentoml.exceptions.NotFound``. """ config, hub_attrs, attrs = process_config(llm.model_id, llm.__llm_trust_remote_code__, **attrs) - safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get("safe_serialisation", None)), attrs.pop("safe_serialization", None), default=llm._serialisation_format == "safetensors") + safe_serialization = openllm.utils.first_not_none( + t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get("safe_serialisation", None)), attrs.pop("safe_serialization", None), default=llm._serialisation_format == "safetensors" + ) if "_quantize" in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata["_quantize"] == "gptq": - if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") + if not openllm.utils.is_autogptq_available(): + raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") if llm.config["model_type"] != "causal_lm": raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") - return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path, *decls, quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), trust_remote_code=llm.__llm_trust_remote_code__, use_safetensors=safe_serialization, **hub_attrs, **attrs) + return autogptq.AutoGPTQForCausalLM.from_quantized( + llm._bentomodel.path, + *decls, + quantize_config=t.cast("autogptq.BaseQuantizeConfig", llm.quantization_config), + trust_remote_code=llm.__llm_trust_remote_code__, + use_safetensors=safe_serialization, + **hub_attrs, + **attrs + ) device_map = attrs.pop("device_map", "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None) - model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.__llm_trust_remote_code__, device_map=device_map, **hub_attrs, **attrs).eval() + model = infer_autoclass_from_llm(llm, config).from_pretrained( + llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.__llm_trust_remote_code__, device_map=device_map, **hub_attrs, **attrs + ).eval() # BetterTransformer is currently only supported on PyTorch. 
   if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
   if llm.__llm_implementation__ in {"pt", "vllm"}: check_unintialised_params(model)
   return t.cast("M", model)
-def save_pretrained(llm: openllm.LLM[M, T], save_directory: str, is_main_process: bool = True, state_dict: DictStrAny | None = None, save_function: t.Any | None = None, push_to_hub: bool = False, max_shard_size: int | str = "10GB", safe_serialization: bool = False, variant: str | None = None, **attrs: t.Any) -> None:
+def save_pretrained(
+    llm: openllm.LLM[M, T],
+    save_directory: str,
+    is_main_process: bool = True,
+    state_dict: DictStrAny | None = None,
+    save_function: t.Any | None = None,
+    push_to_hub: bool = False,
+    max_shard_size: int | str = "10GB",
+    safe_serialization: bool = False,
+    variant: str | None = None,
+    **attrs: t.Any
+) -> None:
   save_function = t.cast(t.Callable[..., None], openllm.utils.first_not_none(save_function, default=torch.save))
   model_save_attrs, tokenizer_save_attrs = openllm.utils.normalize_attrs_to_model_tokenizer_pair(**attrs)
   safe_serialization = safe_serialization or llm._serialisation_format == "safetensors"
   # NOTE: disable safetensors for vllm
   if llm.__llm_implementation__ == "vllm": safe_serialization = False
   if llm._quantize_method == "gptq":
-    if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
+    if not openllm.utils.is_autogptq_available():
+      raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
     if llm.config["model_type"] != "causal_lm": raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
     if not openllm.utils.lenient_issubclass(llm.model, autogptq.modeling.BaseGPTQForCausalLM): raise ValueError(f"Model is not a BaseGPTQForCausalLM (type: {type(llm.model)})")
     t.cast("autogptq.modeling.BaseGPTQForCausalLM", llm.model).save_quantized(save_directory, use_safetensors=safe_serialization)
@@ -165,5 +210,15 @@ def save_pretrained(llm: openllm.LLM[M, T], save_directory: str, is_main_process
     llm.model.save_pretrained(save_directory, safe_serialization=safe_serialization)
   else:
     # We can safely cast here since it will be the PreTrainedModel protocol.
- t.cast("transformers.PreTrainedModel", llm.model).save_pretrained(save_directory, is_main_process=is_main_process, state_dict=state_dict, save_function=save_function, push_to_hub=push_to_hub, max_shard_size=max_shard_size, safe_serialization=safe_serialization, variant=variant, **model_save_attrs) + t.cast("transformers.PreTrainedModel", llm.model).save_pretrained( + save_directory, + is_main_process=is_main_process, + state_dict=state_dict, + save_function=save_function, + push_to_hub=push_to_hub, + max_shard_size=max_shard_size, + safe_serialization=safe_serialization, + variant=variant, + **model_save_attrs + ) llm.tokenizer.save_pretrained(save_directory, push_to_hub=push_to_hub, **tokenizer_save_attrs) diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index d1638572..272bf5c2 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -38,7 +38,8 @@ def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T: def infer_autoclass_from_llm(llm: openllm.LLM[M, T], config: transformers.PretrainedConfig, /) -> _BaseAutoModelClass: if llm.config["trust_remote_code"]: autoclass = "AutoModelForSeq2SeqLM" if llm.config["model_type"] == "seq2seq_lm" else "AutoModelForCausalLM" - if not hasattr(config, "auto_map"): raise ValueError(f"Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping") + if not hasattr(config, "auto_map"): + raise ValueError(f"Invalid configuraiton for {llm.model_id}. ``trust_remote_code=True`` requires `transformers.PretrainedConfig` to contain a `auto_map` mapping") # in case this model doesn't use the correct auto class for model type, for example like chatglm # where it uses AutoModel instead of AutoModelForCausalLM. 
     if autoclass not in config.auto_map: autoclass = "AutoModel"
diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py
index ad2f54cc..31144003 100644
--- a/openllm-python/src/openllm/testing.py
+++ b/openllm-python/src/openllm/testing.py
@@ -5,7 +5,9 @@ if t.TYPE_CHECKING: from ._typing_compat import LiteralRuntime
 logger = logging.getLogger(__name__)
 @contextlib.contextmanager
-def build_bento(model: str, model_id: str | None = None, quantize: t.Literal["int4", "int8", "gptq"] | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
+def build_bento(
+    model: str, model_id: str | None = None, quantize: t.Literal["int4", "int8", "gptq"] | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers", cleanup: bool = False
+) -> t.Iterator[bentoml.Bento]:
   logger.info("Building BentoML for %s", model)
   bento = openllm.build(model, model_id=model_id, quantize=quantize, runtime=runtime)
   yield bento
@@ -28,7 +30,14 @@ def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | N
     logger.info("Deleting container %s", image_tag)
     subprocess.check_output([executable, "rmi", "-f", image_tag])
 @contextlib.contextmanager
-def prepare(model: str, model_id: str | None = None, implementation: LiteralRuntime = "pt", deployment_mode: t.Literal["container", "local"] = "local", clean_context: contextlib.ExitStack | None = None, cleanup: bool = True) -> t.Iterator[str]:
+def prepare(
+    model: str,
+    model_id: str | None = None,
+    implementation: LiteralRuntime = "pt",
+    deployment_mode: t.Literal["container", "local"] = "local",
+    clean_context: contextlib.ExitStack | None = None,
+    cleanup: bool = True
+) -> t.Iterator[str]:
   if clean_context is None:
     clean_context = contextlib.ExitStack()
     cleanup = True
diff --git a/openllm-python/tests/_strategies/_configuration.py b/openllm-python/tests/_strategies/_configuration.py
index e0fdf576..1f8e00a4 100644
--- a/openllm-python/tests/_strategies/_configuration.py
+++ b/openllm-python/tests/_strategies/_configuration.py
@@ -24,7 +24,12 @@ def model_settings(draw: st.DrawFn):
       "workers_per_resource": st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)),
   }
   return draw(st.builds(ModelSettings, **kwargs))
-def make_llm_config(cls_name: str, dunder_config: dict[str, t.Any] | ModelSettings, fields: tuple[tuple[t.LiteralString, str, t.Any], ...] | None = None, generation_fields: tuple[tuple[t.LiteralString, t.Any], ...] | None = None,) -> type[openllm.LLMConfig]:
+def make_llm_config(
+    cls_name: str,
+    dunder_config: dict[str, t.Any] | ModelSettings,
+    fields: tuple[tuple[t.LiteralString, str, t.Any], ...] | None = None,
+    generation_fields: tuple[tuple[t.LiteralString, t.Any], ...] | None = None,
+) -> type[openllm.LLMConfig]:
   globs: dict[str, t.Any] = {"openllm": openllm}
   _config_args: list[str] = []
   lines: list[str] = [f"class {cls_name}Config(openllm.LLMConfig):"]
diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py
index 7eba9618..7ca43b50 100644
--- a/openllm-python/tests/configuration_test.py
+++ b/openllm-python/tests/configuration_test.py
@@ -14,7 +14,11 @@ def test_missing_default():
   with pytest.raises(ValueError, match="Missing required fields *"):
     make_llm_config("MissingArchitecture", {"default_id": "huggingface/t5-tiny-testing", "model_ids": ["huggingface/t5-tiny-testing"], "requirements": ["bentoml"],},)
 def test_forbidden_access():
-  cl_ = make_llm_config("ForbiddenAccess", {"default_id": "huggingface/t5-tiny-testing", "model_ids": ["huggingface/t5-tiny-testing", "bentoml/t5-tiny-testing"], "architecture": "PreTrainedModel", "requirements": ["bentoml"],},)
+  cl_ = make_llm_config(
+      "ForbiddenAccess", {
+          "default_id": "huggingface/t5-tiny-testing", "model_ids": ["huggingface/t5-tiny-testing", "bentoml/t5-tiny-testing"], "architecture": "PreTrainedModel", "requirements": ["bentoml"],
+      },
+  )
   assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), "__config__",)
   assert pytest.raises(openllm.exceptions.ForbiddenAttributeError, cl_.__getattribute__, cl_(), "GenerationConfig",)
@@ -94,7 +98,11 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat
   with monkeypatch.context() as mk:
     mk.setenv(field_env_key("overwrite_with_env_available", "field1"), str(4.0))
     mk.setenv(field_env_key("overwrite_with_env_available", "temperature", suffix="generation"), str(0.2))
-    sent = make_llm_config("OverwriteWithEnvAvailable", {"default_id": "asdfasdf", "model_ids": ["asdf", "asdfasdfads"], "architecture": "PreTrainedModel"}, fields=(("field1", "float", 3.0),),).model_construct_env(field1=20.0, temperature=0.4)
+    sent = make_llm_config(
+        "OverwriteWithEnvAvailable", {
+            "default_id": "asdfasdf", "model_ids": ["asdf", "asdfasdfads"], "architecture": "PreTrainedModel"
+        }, fields=(("field1", "float", 3.0),),
+    ).model_construct_env(field1=20.0, temperature=0.4)
     assert sent.generation_config.temperature == 0.4
     assert sent.field1 == 20.0
 @given(model_settings())
diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py
index a2c31a48..7f18baae 100644
--- a/openllm-python/tests/models/conftest.py
+++ b/openllm-python/tests/models/conftest.py
@@ -109,7 +109,9 @@ class DockerHandle(_Handle):
     container = self.docker_client.containers.get(self.container_name)
     return container.status in ["running", "created"]
 @contextlib.contextmanager
-def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal["container", "local"], quantize: t.Literal["int8", "int4", "gptq"] | None = None, *, _serve_grpc: bool = False,):
+def _local_handle(
+    model: str, model_id: str, image_tag: str, deployment_mode: t.Literal["container", "local"], quantize: t.Literal["int8", "int4", "gptq"] | None = None, *, _serve_grpc: bool = False,
+):
   with openllm.utils.reserve_free_port() as port:
     pass
@@ -129,7 +131,9 @@ def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.
     if proc.stderr: proc.stderr.close()
 @contextlib.contextmanager
-def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal["container", "local"], quantize: t.Literal["int8", "int4", "gptq"] | None = None, *, _serve_grpc: bool = False,):
+def _container_handle(
+    model: str, model_id: str, image_tag: str, deployment_mode: t.Literal["container", "local"], quantize: t.Literal["int8", "int4", "gptq"] | None = None, *, _serve_grpc: bool = False,
+):
   envvar = openllm.utils.EnvVarMixin(model)
   with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port:
@@ -154,7 +158,11 @@ def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode
     gpus = openllm.utils.device_count() or -1
     devs = [docker.types.DeviceRequest(count=gpus, capabilities=[["gpu"]])] if gpus > 0 else None
-    container = client.containers.run(image_tag, command=args, name=container_name, environment=env, auto_remove=False, detach=True, device_requests=devs, ports={"3000/tcp": port, "3001/tcp": prom_port},)
+    container = client.containers.run(
+        image_tag, command=args, name=container_name, environment=env, auto_remove=False, detach=True, device_requests=devs, ports={
+            "3000/tcp": port, "3001/tcp": prom_port
+        },
+    )
     yield DockerHandle(client, container.name, port, deployment_mode)
diff --git a/openllm-python/tests/package_test.py b/openllm-python/tests/package_test.py
index 69c57916..7513cca8 100644
--- a/openllm-python/tests/package_test.py
+++ b/openllm-python/tests/package_test.py
@@ -5,7 +5,9 @@ if t.TYPE_CHECKING: from pathlib import Path
 HF_INTERNAL_T5_TESTING = "hf-internal-testing/tiny-random-t5"
-actions_xfail = functools.partial(pytest.mark.xfail, condition=os.getenv("GITHUB_ACTIONS") is not None, reason="Marking GitHub Actions to xfail due to flakiness and building environment not isolated.",)
+actions_xfail = functools.partial(
+    pytest.mark.xfail, condition=os.getenv("GITHUB_ACTIONS") is not None, reason="Marking GitHub Actions to xfail due to flakiness and building environment not isolated.",
+)
 @actions_xfail
 def test_general_build_with_internal_testing():
   bento_store = BentoMLContainer.bento_store.get()
diff --git a/pyproject.toml b/pyproject.toml
index 5fb19122..efd9df71 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -266,7 +266,7 @@ BLANK_LINE_BEFORE_CLASS_DOCSTRING = false
 BLANK_LINE_BEFORE_MODULE_DOCSTRING = false
 BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
 COALESCE_BRACKETS = true
-COLUMN_LIMIT = 512
+COLUMN_LIMIT = 192
 CONTINUATION_ALIGN_STYLE = "VALIGN-RIGHT"
 DEDENT_CLOSING_BRACKETS = true
 DISABLE_ENDING_COMMA_HEURISTIC = true
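Note: the pyproject.toml hunk above lowers yapf's COLUMN_LIMIT from 512 to 192, which is what drives the multi-line reflow in every Python hunk in this diff; with DEDENT_CLOSING_BRACKETS = true the closing bracket is placed back at the opening statement's indent. A minimal sketch of the effect, using hypothetical code that is not part of this patch (the shortened line stands in for one that exceeds 192 columns):

  # before: under COLUMN_LIMIT = 512 a long call stays on one line (imagine it exceeding 192 columns)
  model = loader.from_pretrained(model_id, *decls, config=config, trust_remote_code=True, device_map="auto", **hub_attrs, **attrs)
  # after re-running yapf with COLUMN_LIMIT = 192, the call is wrapped roughly as:
  model = loader.from_pretrained(
      model_id, *decls, config=config, trust_remote_code=True, device_map="auto", **hub_attrs, **attrs
  )
  # the formatter can be re-applied with, e.g.: yapf --in-place --recursive openllm-python/src openllm-client/src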