diff --git a/changelog.d/240.feature.md b/changelog.d/240.feature.md new file mode 100644 index 00000000..9d608a34 --- /dev/null +++ b/changelog.d/240.feature.md @@ -0,0 +1,9 @@ +OpenLLM now provides SSE support + +> [!NOTE] +> For this to work, you must install BentoML from HEAD: +> `pip install 'git+https://github.com/bentoml/BentoML.git@main'` + +The endpoint can be accessed via `/v1/generate_stream` + +Curl in fact does support SSE (by passing in `-N`) diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py index d92d3e0b..714bda76 100644 --- a/openllm-python/src/openllm/_generation.py +++ b/openllm-python/src/openllm/_generation.py @@ -3,8 +3,10 @@ from __future__ import annotations import typing as t, transformers if t.TYPE_CHECKING: import torch, openllm +# reexport from transformers LogitsProcessorList = transformers.LogitsProcessorList StoppingCriteriaList = transformers.StoppingCriteriaList + class StopSequenceCriteria(transformers.StoppingCriteria): def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast): if isinstance(stop_sequences, str): stop_sequences = [stop_sequences] @@ -20,3 +22,18 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr if 1e-8 <= generation_config["top_p"]: logits_processor.append(transformers.TopPLogitsWarper(generation_config["top_p"])) if generation_config["top_k"] > 0: logits_processor.append(transformers.TopKLogitsWarper(generation_config["top_k"])) return logits_processor + +# NOTE: The ordering here is important. Some models have two of these and we have a preference for which value gets used. 
+SEQLEN_KEYS = ["max_sequence_length", "seq_length", "max_position_embeddings", "max_seq_len", "model_max_length"] +def get_context_length(config: transformers.PretrainedConfig) -> int: + rope_scaling = getattr(config, "rope_scaling", None) + rope_scaling_factor = config.rope_scaling["factor"] if rope_scaling else 1.0 + for key in SEQLEN_KEYS: + if getattr(config, key, None) is not None: return int(rope_scaling_factor*getattr(config,key)) + return 2048 +def is_sentence_complete(output: str) -> bool: return output.endswith((".", "?", "!", "...", "。", "?", "!", "…", '"', "'", "”")) +def is_partial_stop(output: str, stop_str: str) -> bool: + """Check whether the output contains a partial stop str.""" + for i in range(0, min(len(output), len(stop_str))): + if stop_str.startswith(output[-i:]): return True + return False diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index 6a43dde2..a09fda96 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -2,7 +2,7 @@ from __future__ import annotations import functools, inspect, logging, os, re, traceback, types, typing as t, uuid from abc import ABC, abstractmethod from pathlib import Path -import attr, fs.path, inflection, orjson, bentoml, openllm +import attr, fs.path, inflection, orjson, bentoml, openllm, gc from huggingface_hub import hf_hub_download from bentoml._internal.models.model import ModelSignature @@ -152,7 +152,7 @@ class LLMInterface(ABC, t.Generic[M, T]): """ raise NotImplementedError - def generate_iterator(self, prompt: str, **attrs: t.Any) -> t.Iterator[t.Any]: + def generate_iterator(self, prompt: str, /, **attrs: t.Any) -> t.Iterator[t.Any]: """The iterator version of `generate` function.""" raise NotImplementedError("Currently generate_iterator requires SSE (Server-side events) support, which is not yet implemented.") @@ -605,8 +605,7 @@ class LLM(LLMInterface[M, T], ReprMixin): return f"{tag_name}:{model_version}" @classmethod - def 
generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag: - return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs)) + def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag: return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs)) def __init__( self, *args: t.Any, model_id: str, llm_config: LLMConfig, bettertransformer: bool | None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, _quantize_method: t.Literal["int8", "int4", "gptq"] | None, _runtime: t.Literal["ggml", "transformers"], _model_version: str, @@ -711,7 +710,8 @@ class LLM(LLMInterface[M, T], ReprMixin): self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, {**model_kwds, **normalized_model_kwds}, {**tokenizer_kwds, **normalized_tokenizer_kwds}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local) # handle trust_remote_code - self.__llm_trust_remote_code__ = self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"]) + _from_env = os.getenv("TRUST_REMOTE_CODE", None) + self.__llm_trust_remote_code__ = first_not_none(str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"])) self.llm_post_init() # we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init @@ -725,55 +725,30 @@ class LLM(LLMInterface[M, T], ReprMixin): super().__setattr__(attr, value) @property - def adapters_mapping(self) -> AdaptersMapping | None: - return self._adapters_mapping - + def adapters_mapping(self) -> AdaptersMapping | None: return self._adapters_mapping @adapters_mapping.setter - def adapters_mapping(self, value: AdaptersMapping) -> None: - self._adapters_mapping = value - + def adapters_mapping(self, value: AdaptersMapping) -> None: 
self._adapters_mapping = value @property - def __repr_keys__(self) -> set[str]: - return {"model_id", "runner_name", "config", "adapters_mapping", "runtime", "tag"} - + def __repr_keys__(self) -> set[str]: return {"model_id", "runner_name", "config", "adapters_mapping", "runtime", "tag"} def __repr_args__(self) -> ReprArgs: for k in self.__repr_keys__: if k == "config": yield k, self.config.model_dump(flatten=True) else: yield k, getattr(self, k) - @property - def model_id(self) -> str: - return self._model_id - + def model_id(self) -> str: return self._model_id @property - def runtime(self) -> t.Literal["ggml", "transformers"]: - return self._runtime - + def runtime(self) -> t.Literal["ggml", "transformers"]: return self._runtime @property - def runner_name(self) -> str: - return f"llm-{self.config['start_name']}-runner" - - # NOTE: The section below defines a loose contract with langchain's LLM interface. + def runner_name(self) -> str: return f"llm-{self.config['start_name']}-runner" @property - def llm_type(self) -> str: - return normalise_model_name(self._model_id) - + def llm_type(self) -> str: return normalise_model_name(self._model_id) @property - def identifying_params(self) -> DictStrAny: - return {"configuration": self.config.model_dump_json().decode(), "model_ids": orjson.dumps(self.config["model_ids"]).decode()} - + def identifying_params(self) -> DictStrAny: return {"configuration": self.config.model_dump_json().decode(), "model_ids": orjson.dumps(self.config["model_ids"]).decode()} @property - def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: - return (self._model_decls, self._model_attrs), self._tokenizer_attrs - + def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: return (self._model_decls, self._model_attrs), self._tokenizer_attrs @property - def tag(self) -> bentoml.Tag: - return self._tag - - # ensure_model_id_exists can be called to save the model to local store - def 
ensure_model_id_exists(self) -> bentoml.Model: - from . import import_model - return import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format) + def tag(self) -> bentoml.Tag: return self._tag + def ensure_model_id_exists(self) -> bentoml.Model: return openllm.import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format) @property def _bentomodel(self) -> bentoml.Model: @@ -916,10 +891,8 @@ class LLM(LLMInterface[M, T], ReprMixin): """ models = models if models is not None else [] - try: - models.append(self._bentomodel) - except bentoml.exceptions.NotFound as err: - raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None + try: models.append(self._bentomodel) + except bentoml.exceptions.NotFound as err: raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None if scheduling_strategy is None: from ._strategies import CascadingResourceStrategy @@ -927,7 +900,7 @@ class LLM(LLMInterface[M, T], ReprMixin): generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False))) embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0))) - generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True))) + generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False))) # NOTE: returning the two langchain API's to the runner return llm_runner_class(self)( @@ -936,9 +909,7 @@ class LLM(LLMInterface[M, T], ReprMixin): ) # NOTE: Scikit API - def predict(self, prompt: str, 
**attrs: t.Any) -> t.Any: - return self.__call__(prompt, **attrs) - + def predict(self, prompt: str, **attrs: t.Any) -> t.Any: return self(prompt, **attrs) def __call__(self, prompt: str, **attrs: t.Any) -> t.Any: """Returns the generation result and format the result. @@ -956,7 +927,95 @@ class LLM(LLMInterface[M, T], ReprMixin): prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **attrs) return self.postprocess_generate(prompt, self.generate(prompt, **generate_kwargs), **postprocess_kwargs) -# fmt: off + def generate(self, prompt: str, **attrs: t.Any) -> t.Any: + # TODO: support different generation strategies, similar to self.model.generate + for it in self.generate_iterator(prompt, **attrs): pass + return it + + def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> str: + if isinstance(generation_result, dict): return generation_result["text"] + return generation_result + + def generate_iterator(self, prompt: str, /, + *, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]: + # NOTE: encoder-decoder models will need to implement their own generate_iterator for now + # inspired from fastchat's generate_stream_func + from ._generation import prepare_logits_processor, get_context_length, is_partial_stop + + len_prompt = len(prompt) + if stop_token_ids is None: stop_token_ids = [] + stop_token_ids.append(self.tokenizer.eos_token_id) + + logits_processor = prepare_logits_processor(self.config) + + input_ids = self.tokenizer(prompt).input_ids + + if context_length is None: context_length = get_context_length(self.model.config) + max_src_len = context_length - self.config["max_new_tokens"] - 1 + + input_ids = input_ids[-max_src_len:] + output_ids = list(input_ids) + input_echo_len = len(input_ids) + + past_key_values = out = token = None + for i in 
range(self.config["max_new_tokens"]): + if i == 0: # prefill + out = self.model(torch.as_tensor([input_ids], device=self.device), use_cache=True) + else: # decoding + out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values) # type: ignore[has-type] + logits = out.logits + past_key_values = out.past_key_values + + last_token_logits = logits_processor(torch.as_tensor([output_ids], device=logits.device) if self.config["repetition_penalty"] > 1.0 else None, logits[:, -1, :])[0] if logits_processor else logits[0, -1, :] + # Switch to CPU by avoiding some bugs in mps backend. + if self.device.type == "mps": last_token_logits = last_token_logits.float().to("cpu") + + if self.config["temperature"] < 1e-5 or self.config["top_p"] < 1e-8: token = int(torch.argmax(last_token_logits)) # greedy + else: token = int(torch.multinomial(torch.softmax(last_token_logits, dim=-1), num_samples=1)) + output_ids.append(token) + + if token in stop_token_ids: stopped = True + else: stopped = False + + # Yield the output tokens + if i % stream_interval == 0 or i == self.config["max_new_tokens"] - 1 or stopped: + tmp_output_ids = output_ids if echo else output_ids[input_echo_len:] + rfind_start = len_prompt if echo else 0 + output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True) + + partially_stopped = False + if stop: + if isinstance(stop, str): + pos = output.rfind(stop, rfind_start) + if pos != -1: output, stopped = output[:pos], True + else: partially_stopped = is_partial_stop(output, stop) + elif isinstance(stop, t.Iterable): + for each_stop in stop: + pos = output.rfind(each_stop, rfind_start) + if pos != -1: + output, stopped = output[:pos], True + break + else: + partially_stopped = is_partial_stop(output, each_stop) + if partially_stopped: break + else: raise ValueError("Invalid stop field type.") + + # Prevent yielding partial 
stop sequence + if not partially_stopped: + yield {"text": output, "usage": {"prompt_tokens": input_echo_len, "completion_tokens": i, "total_tokens": input_echo_len + i}, "finish_reason": None} + if stopped: break + + # Finish stream event, which contains finish reason + if i == self.config["max_new_tokens"] - 1: finish_reason = "length" + elif stopped: finish_reason = "stop" + else: finish_reason = None + yield {"text": output, "usage": {"prompt_tokens": input_echo_len, "completion_tokens": i, "total_tokens": input_echo_len + i}, "finish_reason": finish_reason} + + # Clean + del past_key_values, out + gc.collect() + torch.cuda.empty_cache() + @overload def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ... @overload @@ -965,8 +1024,6 @@ def Runner(model_name: str, *, model_id: str = ..., model_version: str | None = def Runner(model_name: str, *, ensure_available: bool | None = None, init_local: bool = ..., implementation: LiteralRuntime | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ... @overload def Runner(model_name: str, *, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4", "gptq"] | None = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ... 
-# fmt: on - def Runner(model_name: str, ensure_available: bool | None = None, init_local: bool = False, implementation: LiteralRuntime | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: """Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'. @@ -1008,9 +1065,7 @@ def Runner(model_name: str, ensure_available: bool | None = None, init_local: bo if init_local: runner.init_local(quiet=True) return runner -def method_signature(sig: ModelSignature) -> ModelSignatureDict: - return bentoml_cattr.unstructure(sig) - +def method_signature(sig: ModelSignature) -> ModelSignatureDict: return bentoml_cattr.unstructure(sig) class SetAdapterOutput(t.TypedDict): success: bool message: str @@ -1019,7 +1074,6 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate class _Runnable(bentoml.Runnable): SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu", "cpu") SUPPORTS_CPU_MULTI_THREADING = True - def __init__(__self: _Runnable): # NOTE: The side effect of this line # is that it will load the imported model during @@ -1028,40 +1082,41 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate if self.adapters_mapping is not None: logger.info("Applying LoRA to %s...", self.runner_name) self.apply_adapter(inference_mode=True, load_adapters="all") - def set_adapter(__self: _Runnable, adapter_name: str) -> None: if self.__llm_adapter_map__ is None: raise ValueError("No adapters available for current running server.") elif not isinstance(self.model, peft.PeftModel): raise RuntimeError("Model is not a PeftModel") if adapter_name != "default": self.model.set_adapter(adapter_name) logger.info("Successfully apply LoRA layer %s", adapter_name) - @bentoml.Runnable.method(**method_signature(embeddings_sig)) - def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]: - return [self.embeddings([prompt] if isinstance(prompt, 
str) else prompt)] - + def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]: return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)] @bentoml.Runnable.method(**method_signature(generate_sig)) def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]: adapter_name = attrs.pop("adapter_name", None) if adapter_name is not None: __self.set_adapter(adapter_name) return self.generate(prompt, **attrs) - @bentoml.Runnable.method(**method_signature(generate_sig)) def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]: adapter_name = attrs.pop("adapter_name", None) if adapter_name is not None: __self.set_adapter(adapter_name) return self.generate(prompt, **attrs) - @bentoml.Runnable.method(**method_signature(generate_sig)) def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal["generated_text"], str]]: adapter_name = attrs.pop("adapter_name", None) if adapter_name is not None: __self.set_adapter(adapter_name) return self.generate_one(prompt, stop, **attrs) - @bentoml.Runnable.method(**method_signature(generate_iterator_sig)) - def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[t.Any, None, None]: + def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]: adapter_name = attrs.pop("adapter_name", None) if adapter_name is not None: __self.set_adapter(adapter_name) - yield self.generate_iterator(prompt, **attrs) + pre = 0 + for outputs in self.generate_iterator(prompt, **attrs): + output_text = outputs["text"].strip().split(" ") + now = len(output_text) - 1 + if now > pre: + yield " ".join(output_text[pre:now]) + pre = now + yield " ".join(output_text[pre:]) + return " ".join(output_text) return types.new_class(self.__class__.__name__ + "Runnable", (_Runnable,), {}, lambda ns: ns.update({"SUPPORTED_RESOURCES": ("nvidia.com/gpu", "amd.com/gpu") if 
self.config["requires_gpu"] else ("nvidia.com/gpu", "amd.com/gpu", "cpu"), "__module__": self.__module__, "__doc__": self.config["env"].start_docstring})) @@ -1098,9 +1153,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: """ return __self.embeddings.run([prompt] if isinstance(prompt, str) else prompt) - def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: - return {"config", "llm_type", "runner_methods", "runtime", "llm_tag"} - + def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: return {"config", "llm_type", "runner_methods", "runtime", "llm_tag"} def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs: yield "runner_methods", {method.name: {"batchable": method.config.batchable, "batch_dim": method.config.batch_dim if method.config.batchable else None} for method in __self.runner_methods} yield "config", self.config.model_dump(flatten=True) @@ -1108,13 +1161,8 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: yield "runtime", self.runtime yield "llm_tag", self.tag - return types.new_class( - self.__class__.__name__ + "Runner", (bentoml.Runner,), exec_body=lambda ns: ns.update({ - "llm_type": self.llm_type, "identifying_params": self.identifying_params, "llm_tag": self.tag, "llm": self, # NOTE: self reference to LLM - "config": self.config, "implementation": self.__llm_implementation__, "peft_adapters": property(fget=available_adapters), "download_model": self.ensure_model_id_exists, "__call__": _wrapped_generate_run, "embed": _wrapped_embeddings_run, "__module__": self.__module__, "__doc__": self.config["env"].start_docstring, "__repr__": ReprMixin.__repr__, "__repr_keys__": property( - _wrapped_repr_keys - ), "__repr_args__": _wrapped_repr_args, "supports_embeddings": self["supports_embeddings"], "supports_hf_agent": self["supports_generate_one"], "has_adapters": self._adapters_mapping is not None, - }), - ) + return types.new_class(self.__class__.__name__ + "Runner", (bentoml.Runner,), exec_body=lambda ns: 
ns.update({"llm_type": self.llm_type, "identifying_params": self.identifying_params, "llm_tag": self.tag, "llm": self, "config": self.config, "implementation": self.__llm_implementation__, "peft_adapters": property(fget=available_adapters), + "download_model": self.ensure_model_id_exists, "__call__": _wrapped_generate_run, "embed": _wrapped_embeddings_run, "__module__": self.__module__, "__doc__": self.config["env"].start_docstring, "__repr__": ReprMixin.__repr__, + "__repr_keys__": property( _wrapped_repr_keys), "__repr_args__": _wrapped_repr_args, "supports_embeddings": self["supports_embeddings"], "supports_hf_agent": self["supports_generate_one"], "has_adapters": self._adapters_mapping is not None})) __all__ = ["LLMRunner", "LLMRunnable", "Runner", "LLM", "llm_runner_class", "llm_runnable_class", "LLMEmbeddings"] diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 7ca3b3ef..40a6553e 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -19,14 +19,20 @@ generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name runners: t.Sequence[bentoml.Runner] = [runner] if not runner.supports_embeddings: runners.append(generic_embedding_runner) svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners) +_JsonInput=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True), "adapter_name": ""}) -@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)})) +@svc.api(route="/v1/generate", input=_JsonInput, output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)})) async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput: qa_inputs = 
openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) config = qa_inputs.llm_config.model_dump() responses = await runner.generate.async_run(qa_inputs.prompt, **{"adapter_name": qa_inputs.adapter_name, **config}) return openllm.GenerationOutput(responses=responses, configuration=config) +@svc.api(route="/v1/generate_stream", input=_JsonInput,output=bentoml.io.Text(content_type="text/event-stream")) +async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]: + qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) + return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, **qa_inputs.llm_config.model_dump()) + @svc.api(route="/v1/metadata", input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample({"model_id": runner.llm.model_id, "timeout": 3600, "model_name": llm_config["model_name"], "framework": "pt", "configuration": "", "supports_embeddings": runner.supports_embeddings, "supports_hf_agent": runner.supports_hf_agent})) def metadata_v1(_: str) -> openllm.MetadataOutput: return openllm.MetadataOutput(timeout=llm_config["timeout"], model_name=llm_config["model_name"], framework=llm_config["env"]["framework_value"], model_id=runner.llm.model_id, configuration=llm_config.model_dump_json().decode(), supports_embeddings=runner.supports_embeddings, supports_hf_agent=runner.supports_hf_agent) diff --git a/openllm-python/src/openllm/_typing_compat.py b/openllm-python/src/openllm/_typing_compat.py index dd6b35eb..5e1c731b 100644 --- a/openllm-python/src/openllm/_typing_compat.py +++ b/openllm-python/src/openllm/_typing_compat.py @@ -69,7 +69,7 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]): embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings] generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]] generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]] - 
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]] + generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]] class LLMRunner(bentoml.Runner, t.Generic[M, T]): __doc__: str @@ -85,7 +85,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]): embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]] generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]] generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]] - generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]] + generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]] def __init__(self, runnable_class: type[LLMRunnable[M, T]], *, runnable_init_params: dict[str, t.Any] | None = ..., name: str | None = ..., scheduling_strategy: type[Strategy] = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, dict[str, int]] | None = ..., embedded: bool = False,) -> None: ... def __call__(self, prompt: str, **attrs: t.Any) -> t.Any: ... 
@abc.abstractmethod diff --git a/openllm-python/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py index e7915ee1..60226f03 100644 --- a/openllm-python/src/openllm/models/llama/modeling_llama.py +++ b/openllm-python/src/openllm/models/llama/modeling_llama.py @@ -11,9 +11,6 @@ class Llama(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaToke @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {} def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {} - def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] - def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), generation_config=self.config.model_construct_env(**attrs).to_generation_config(), do_sample=True, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])), skip_special_tokens=True, clean_up_tokenization_spaces=True) def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings: encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device) input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"] diff --git 
a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py index 0e0c07b6..87afefff 100644 --- a/openllm-python/src/openllm/serialisation/transformers/__init__.py +++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py @@ -128,7 +128,7 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: """ try: model = bentoml.models.get(llm.tag) - if model.info.module not in ("openllm.serialisation.transformers", __name__): + if model.info.module not in ("openllm.serialisation.transformers", "bentoml.transformers", "bentoml._internal.frameworks.transformers", __name__): # NOTE: backward compatible with previous version of OpenLLM. raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.") if "runtime" in model.info.labels and model.info.labels["runtime"] != llm.runtime: raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.") diff --git a/pyproject.toml b/pyproject.toml index b3368009..55c9e1e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -161,7 +161,6 @@ extend-exclude = [ ] extend-include = ["*.ipynb"] extend-select = [ - "B", # flake8-bugbear "I", # isort "G", # flake8-logging-format "D", # pydocstyle @@ -180,7 +179,6 @@ extend-select = [ ] fix = true ignore = [ - "B027", # Allow non-abstract empty methods in abstract base classes "FBT003", # Allow boolean positional values in function calls, like `dict.get(... True)` "S105", # Ignore checks for possible passwords "S106", @@ -201,7 +199,6 @@ ignore = [ "D100", "TCH004", # don't move runtime import out, just warn about it "RUF012", # mutable attributes to be used with ClassVar - "B905", # zip warning about strict, only applicable for 3.10+ "D105", # magic docstring "E701", # multiple statement on single line ]