feat: token streaming and SSE support (#240)

This commit is contained in:
Aaron Pham
2023-08-20 07:32:49 -04:00
committed by GitHub
parent 69fb149069
commit 9e205b4963
8 changed files with 161 additions and 87 deletions

View File

@@ -3,8 +3,10 @@ from __future__ import annotations
import typing as t, transformers
if t.TYPE_CHECKING: import torch, openllm
# reexport from transformers
LogitsProcessorList = transformers.LogitsProcessorList
StoppingCriteriaList = transformers.StoppingCriteriaList
class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
@@ -20,3 +22,18 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr
if 1e-8 <= generation_config["top_p"]: logits_processor.append(transformers.TopPLogitsWarper(generation_config["top_p"]))
if generation_config["top_k"] > 0: logits_processor.append(transformers.TopKLogitsWarper(generation_config["top_k"]))
return logits_processor
# NOTE: The ordering here is important. Some models have two of these and we have a preference for which value gets used.
SEQLEN_KEYS = ["max_sequence_length", "seq_length", "max_position_embeddings", "max_seq_len", "model_max_length"]
def get_context_length(config: transformers.PretrainedConfig) -> int:
    """Return the maximum context length declared by ``config``.

    The first attribute from ``SEQLEN_KEYS`` that is set wins, scaled by the
    RoPE scaling factor when one is declared. Falls back to 2048 when the
    config exposes none of the known keys.
    """
    rope_scaling = getattr(config, "rope_scaling", None)
    # Use the local binding and tolerate rope_scaling dicts without a
    # "factor" entry (some scaling schemes omit it) instead of raising KeyError.
    rope_scaling_factor = rope_scaling.get("factor", 1.0) if rope_scaling else 1.0
    for key in SEQLEN_KEYS:
        value = getattr(config, key, None)
        if value is not None:
            return int(rope_scaling_factor * value)
    return 2048
def is_sentence_complete(output: str) -> bool:
    """Return True when *output* ends with sentence-terminating punctuation (ASCII or CJK).

    BUG FIX: the previous tuple contained mis-encoded empty strings, and
    ``str.endswith("")`` is always True, so every output was reported as
    complete. Restore the intended fullwidth terminators: 。 ？ ！ … ”.
    """
    return output.endswith((".", "?", "!", "...", "。", "？", "！", "…", '"', "'", "”"))
def is_partial_stop(output: str, stop_str: str) -> bool:
    """Check whether the output contains a partial stop str."""
    overlap = min(len(output), len(stop_str))
    # k == 0 probes the whole output (output[-0:] == output); larger k
    # probes progressively longer suffixes, mirroring the index loop form.
    return any(stop_str.startswith(output[-k:]) for k in range(overlap))

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
import functools, inspect, logging, os, re, traceback, types, typing as t, uuid
from abc import ABC, abstractmethod
from pathlib import Path
import attr, fs.path, inflection, orjson, bentoml, openllm
import attr, fs.path, inflection, orjson, bentoml, openllm, gc
from huggingface_hub import hf_hub_download
from bentoml._internal.models.model import ModelSignature
@@ -152,7 +152,7 @@ class LLMInterface(ABC, t.Generic[M, T]):
"""
raise NotImplementedError
def generate_iterator(self, prompt: str, **attrs: t.Any) -> t.Iterator[t.Any]:
def generate_iterator(self, prompt: str, /, **attrs: t.Any) -> t.Iterator[t.Any]:
"""The iterator version of `generate` function."""
raise NotImplementedError("Currently generate_iterator requires SSE (Server-side events) support, which is not yet implemented.")
@@ -605,8 +605,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
return f"{tag_name}:{model_version}"
@classmethod
def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag:
return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))
def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag: return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))
def __init__(
self, *args: t.Any, model_id: str, llm_config: LLMConfig, bettertransformer: bool | None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, _quantize_method: t.Literal["int8", "int4", "gptq"] | None, _runtime: t.Literal["ggml", "transformers"], _model_version: str,
@@ -711,7 +710,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, {**model_kwds, **normalized_model_kwds}, {**tokenizer_kwds, **normalized_tokenizer_kwds}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local)
# handle trust_remote_code
self.__llm_trust_remote_code__ = self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"])
_from_env = os.getenv("TRUST_REMOTE_CODE", None)
self.__llm_trust_remote_code__ = first_not_none(str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"]))
self.llm_post_init()
# we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
@@ -725,55 +725,30 @@ class LLM(LLMInterface[M, T], ReprMixin):
super().__setattr__(attr, value)
@property
def adapters_mapping(self) -> AdaptersMapping | None:
return self._adapters_mapping
def adapters_mapping(self) -> AdaptersMapping | None: return self._adapters_mapping
@adapters_mapping.setter
def adapters_mapping(self, value: AdaptersMapping) -> None:
self._adapters_mapping = value
def adapters_mapping(self, value: AdaptersMapping) -> None: self._adapters_mapping = value
@property
def __repr_keys__(self) -> set[str]:
return {"model_id", "runner_name", "config", "adapters_mapping", "runtime", "tag"}
def __repr_keys__(self) -> set[str]: return {"model_id", "runner_name", "config", "adapters_mapping", "runtime", "tag"}
def __repr_args__(self) -> ReprArgs:
for k in self.__repr_keys__:
if k == "config": yield k, self.config.model_dump(flatten=True)
else: yield k, getattr(self, k)
@property
def model_id(self) -> str:
return self._model_id
def model_id(self) -> str: return self._model_id
@property
def runtime(self) -> t.Literal["ggml", "transformers"]:
return self._runtime
def runtime(self) -> t.Literal["ggml", "transformers"]: return self._runtime
@property
def runner_name(self) -> str:
return f"llm-{self.config['start_name']}-runner"
# NOTE: The section below defines a loose contract with langchain's LLM interface.
def runner_name(self) -> str: return f"llm-{self.config['start_name']}-runner"
@property
def llm_type(self) -> str:
return normalise_model_name(self._model_id)
def llm_type(self) -> str: return normalise_model_name(self._model_id)
@property
def identifying_params(self) -> DictStrAny:
return {"configuration": self.config.model_dump_json().decode(), "model_ids": orjson.dumps(self.config["model_ids"]).decode()}
def identifying_params(self) -> DictStrAny: return {"configuration": self.config.model_dump_json().decode(), "model_ids": orjson.dumps(self.config["model_ids"]).decode()}
@property
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]:
return (self._model_decls, self._model_attrs), self._tokenizer_attrs
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: return (self._model_decls, self._model_attrs), self._tokenizer_attrs
@property
def tag(self) -> bentoml.Tag:
return self._tag
# ensure_model_id_exists can be called to save the model to local store
def ensure_model_id_exists(self) -> bentoml.Model:
from . import import_model
return import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format)
def tag(self) -> bentoml.Tag: return self._tag
def ensure_model_id_exists(self) -> bentoml.Model: return openllm.import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format)
@property
def _bentomodel(self) -> bentoml.Model:
@@ -916,10 +891,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
"""
models = models if models is not None else []
try:
models.append(self._bentomodel)
except bentoml.exceptions.NotFound as err:
raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None
try: models.append(self._bentomodel)
except bentoml.exceptions.NotFound as err: raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None
if scheduling_strategy is None:
from ._strategies import CascadingResourceStrategy
@@ -927,7 +900,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0)))
generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True)))
generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
# NOTE: returning the two langchain API's to the runner
return llm_runner_class(self)(
@@ -936,9 +909,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
)
# NOTE: Scikit API
def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
return self.__call__(prompt, **attrs)
def predict(self, prompt: str, **attrs: t.Any) -> t.Any: return self(prompt, **attrs)
def __call__(self, prompt: str, **attrs: t.Any) -> t.Any:
"""Returns the generation result and format the result.
@@ -956,7 +927,95 @@ class LLM(LLMInterface[M, T], ReprMixin):
prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **attrs)
return self.postprocess_generate(prompt, self.generate(prompt, **generate_kwargs), **postprocess_kwargs)
# fmt: off
def generate(self, prompt: str, **attrs: t.Any) -> t.Any:
    """Run generation to completion and return the final streamed payload.

    Drains ``generate_iterator`` and returns its last yielded item (the chunk
    carrying the ``finish_reason``), or ``None`` when nothing was yielded.
    """
    # TODO: support different generation strategies, similar to self.model.generate
    final: t.Any = None
    # BUG FIX: previously the loop variable was returned unbound when the
    # iterator yielded nothing (e.g. max_new_tokens == 0), raising NameError.
    for final in self.generate_iterator(prompt, **attrs):
        pass
    return final
def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> str:
    """Return the text payload of *generation_result*.

    Streaming results are dicts carrying a "text" key; any other result is
    passed through untouched.
    """
    is_stream_payload = isinstance(generation_result, dict)
    return generation_result["text"] if is_stream_payload else generation_result
def generate_iterator(self, prompt: str, /,
*, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]:
"""Stream generation results token-by-token for decoder-only models.

Yields dicts of the form {"text": ..., "usage": {...}, "finish_reason": ...};
intermediate chunks carry finish_reason=None, the final chunk carries
"length", "stop" or None. `echo` controls whether the prompt is included in
the decoded output; `stream_interval` throttles how often chunks are yielded.
"""
# NOTE: encoder-decoder models will need to implement their own generate_iterator for now
# inspired from fastchat's generate_stream_func
from ._generation import prepare_logits_processor, get_context_length, is_partial_stop
len_prompt = len(prompt)
if stop_token_ids is None: stop_token_ids = []
# NOTE(review): this appends to the caller's list when one was passed in —
# a copy would avoid the visible mutation; confirm no caller relies on it.
stop_token_ids.append(self.tokenizer.eos_token_id)
logits_processor = prepare_logits_processor(self.config)
input_ids = self.tokenizer(prompt).input_ids
if context_length is None: context_length = get_context_length(self.model.config)
# Reserve room for the new tokens (and one extra slot) within the context window.
max_src_len = context_length - self.config["max_new_tokens"] - 1
input_ids = input_ids[-max_src_len:]
output_ids = list(input_ids)
input_echo_len = len(input_ids)
past_key_values = out = token = None
# NOTE(review): `torch` appears to be imported under TYPE_CHECKING only in
# this module — confirm it is available at runtime here.
for i in range(self.config["max_new_tokens"]):
if i == 0: # prefill
out = self.model(torch.as_tensor([input_ids], device=self.device), use_cache=True)
else: # decoding
out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values) # type: ignore[has-type]
logits = out.logits
past_key_values = out.past_key_values
# Apply repetition penalty/temperature/top-p/top-k warpers when configured;
# otherwise take the raw logits of the last position.
last_token_logits = logits_processor(torch.as_tensor([output_ids], device=logits.device) if self.config["repetition_penalty"] > 1.0 else None, logits[:, -1, :])[0] if logits_processor else logits[0, -1, :]
# Switch to CPU by avoiding some bugs in mps backend.
if self.device.type == "mps": last_token_logits = last_token_logits.float().to("cpu")
# Near-zero temperature or top_p degenerates to greedy decoding.
if self.config["temperature"] < 1e-5 or self.config["top_p"] < 1e-8: token = int(torch.argmax(last_token_logits)) # greedy
else: token = int(torch.multinomial(torch.softmax(last_token_logits, dim=-1), num_samples=1))
output_ids.append(token)
if token in stop_token_ids: stopped = True
else: stopped = False
# Yield the output tokens
if i % stream_interval == 0 or i == self.config["max_new_tokens"] - 1 or stopped:
tmp_output_ids = output_ids if echo else output_ids[input_echo_len:]
# When echoing, only search for stop strings past the prompt text.
rfind_start = len_prompt if echo else 0
output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
partially_stopped = False
if stop:
if isinstance(stop, str):
# Truncate at the last full stop-string occurrence, else remember a
# partial match so we hold back this chunk.
pos = output.rfind(stop, rfind_start)
if pos != -1: output, stopped = output[:pos], True
else: partially_stopped = is_partial_stop(output, stop)
elif isinstance(stop, t.Iterable):
for each_stop in stop:
pos = output.rfind(each_stop, rfind_start)
if pos != -1:
output, stopped = output[:pos], True
break
else:
partially_stopped = is_partial_stop(output, each_stop)
if partially_stopped: break
else: raise ValueError("Invalid stop field type.")
# Prevent yielding partial stop sequence
if not partially_stopped:
yield {"text": output, "usage": {"prompt_tokens": input_echo_len, "completion_tokens": i, "total_tokens": input_echo_len + i}, "finish_reason": None}
if stopped: break
# Finish stream event, which contains finish reason
if i == self.config["max_new_tokens"] - 1: finish_reason = "length"
elif stopped: finish_reason = "stop"
else: finish_reason = None
# NOTE(review): if max_new_tokens is 0 the loop never runs and `output`/`i`
# are unbound here — confirm configs always enforce max_new_tokens >= 1.
yield {"text": output, "usage": {"prompt_tokens": input_echo_len, "completion_tokens": i, "total_tokens": input_echo_len + i}, "finish_reason": finish_reason}
# Clean
del past_key_values, out
gc.collect()
torch.cuda.empty_cache()
@overload
def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
@overload
@@ -965,8 +1024,6 @@ def Runner(model_name: str, *, model_id: str = ..., model_version: str | None =
def Runner(model_name: str, *, ensure_available: bool | None = None, init_local: bool = ..., implementation: LiteralRuntime | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
@overload
def Runner(model_name: str, *, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4", "gptq"] | None = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
# fmt: on
def Runner(model_name: str, ensure_available: bool | None = None, init_local: bool = False, implementation: LiteralRuntime | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.
@@ -1008,9 +1065,7 @@ def Runner(model_name: str, ensure_available: bool | None = None, init_local: bo
if init_local: runner.init_local(quiet=True)
return runner
def method_signature(sig: ModelSignature) -> ModelSignatureDict:
return bentoml_cattr.unstructure(sig)
def method_signature(sig: ModelSignature) -> ModelSignatureDict: return bentoml_cattr.unstructure(sig)
class SetAdapterOutput(t.TypedDict):
    # Result payload for adapter-switching endpoints.
    # Whether the adapter switch succeeded.
    success: bool
    # Human-readable status message.
    message: str
@@ -1019,7 +1074,6 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
class _Runnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu", "cpu")
SUPPORTS_CPU_MULTI_THREADING = True
def __init__(__self: _Runnable):
# NOTE: The side effect of this line
# is that it will load the imported model during
@@ -1028,40 +1082,41 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
if self.adapters_mapping is not None:
logger.info("Applying LoRA to %s...", self.runner_name)
self.apply_adapter(inference_mode=True, load_adapters="all")
def set_adapter(__self: _Runnable, adapter_name: str) -> None:
"""Activate the named LoRA adapter on the underlying model.

Raises ValueError when no adapters were loaded for this server and
RuntimeError when the model is not a PeftModel. The "default" adapter is
assumed to be already active and is not re-applied.
"""
# NOTE(review): `self` is the LLM captured by the enclosing closure while
# `__self` is the Runnable instance — confirm against the full file.
if self.__llm_adapter_map__ is None: raise ValueError("No adapters available for current running server.")
elif not isinstance(self.model, peft.PeftModel): raise RuntimeError("Model is not a PeftModel")
if adapter_name != "default": self.model.set_adapter(adapter_name)
logger.info("Successfully apply LoRA layer %s", adapter_name)
@bentoml.Runnable.method(**method_signature(embeddings_sig))
def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]:
return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]
def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]: return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]
@bentoml.Runnable.method(**method_signature(generate_sig))
def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
"""Run one generation on the enclosing LLM, optionally switching LoRA adapter first."""
# "adapter_name" is consumed here so it is not forwarded to generate().
adapter_name = attrs.pop("adapter_name", None)
if adapter_name is not None: __self.set_adapter(adapter_name)
# `self` is the LLM captured by the enclosing llm_runnable_class closure.
return self.generate(prompt, **attrs)
@bentoml.Runnable.method(**method_signature(generate_sig))
def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
"""Delegate to the enclosing LLM's generate(), optionally switching LoRA adapter first."""
# "adapter_name" is consumed here so it is not forwarded to generate().
adapter_name = attrs.pop("adapter_name", None)
if adapter_name is not None: __self.set_adapter(adapter_name)
# `self` is the LLM captured by the enclosing llm_runnable_class closure.
return self.generate(prompt, **attrs)
@bentoml.Runnable.method(**method_signature(generate_sig))
def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal["generated_text"], str]]:
"""Delegate to the enclosing LLM's generate_one() with explicit stop strings, optionally switching LoRA adapter first."""
# "adapter_name" is consumed here so it is not forwarded to generate_one().
adapter_name = attrs.pop("adapter_name", None)
if adapter_name is not None: __self.set_adapter(adapter_name)
# `self` is the LLM captured by the enclosing llm_runnable_class closure.
return self.generate_one(prompt, stop, **attrs)
@bentoml.Runnable.method(**method_signature(generate_iterator_sig))
def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[t.Any, None, None]:
def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]:
adapter_name = attrs.pop("adapter_name", None)
if adapter_name is not None: __self.set_adapter(adapter_name)
yield self.generate_iterator(prompt, **attrs)
pre = 0
for outputs in self.generate_iterator(prompt, **attrs):
output_text = outputs["text"].strip().split(" ")
now = len(output_text) - 1
if now > pre:
yield " ".join(output_text[pre:now])
pre = now
yield " ".join(output_text[pre:])
return " ".join(output_text)
return types.new_class(self.__class__.__name__ + "Runnable", (_Runnable,), {}, lambda ns: ns.update({"SUPPORTED_RESOURCES": ("nvidia.com/gpu", "amd.com/gpu") if self.config["requires_gpu"] else ("nvidia.com/gpu", "amd.com/gpu", "cpu"), "__module__": self.__module__, "__doc__": self.config["env"].start_docstring}))
@@ -1098,9 +1153,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
"""
return __self.embeddings.run([prompt] if isinstance(prompt, str) else prompt)
def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]:
return {"config", "llm_type", "runner_methods", "runtime", "llm_tag"}
def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: return {"config", "llm_type", "runner_methods", "runtime", "llm_tag"}
def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs:
yield "runner_methods", {method.name: {"batchable": method.config.batchable, "batch_dim": method.config.batch_dim if method.config.batchable else None} for method in __self.runner_methods}
yield "config", self.config.model_dump(flatten=True)
@@ -1108,13 +1161,8 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
yield "runtime", self.runtime
yield "llm_tag", self.tag
return types.new_class(
self.__class__.__name__ + "Runner", (bentoml.Runner,), exec_body=lambda ns: ns.update({
"llm_type": self.llm_type, "identifying_params": self.identifying_params, "llm_tag": self.tag, "llm": self, # NOTE: self reference to LLM
"config": self.config, "implementation": self.__llm_implementation__, "peft_adapters": property(fget=available_adapters), "download_model": self.ensure_model_id_exists, "__call__": _wrapped_generate_run, "embed": _wrapped_embeddings_run, "__module__": self.__module__, "__doc__": self.config["env"].start_docstring, "__repr__": ReprMixin.__repr__, "__repr_keys__": property(
_wrapped_repr_keys
), "__repr_args__": _wrapped_repr_args, "supports_embeddings": self["supports_embeddings"], "supports_hf_agent": self["supports_generate_one"], "has_adapters": self._adapters_mapping is not None,
}),
)
return types.new_class(self.__class__.__name__ + "Runner", (bentoml.Runner,), exec_body=lambda ns: ns.update({"llm_type": self.llm_type, "identifying_params": self.identifying_params, "llm_tag": self.tag, "llm": self, "config": self.config, "implementation": self.__llm_implementation__, "peft_adapters": property(fget=available_adapters),
"download_model": self.ensure_model_id_exists, "__call__": _wrapped_generate_run, "embed": _wrapped_embeddings_run, "__module__": self.__module__, "__doc__": self.config["env"].start_docstring, "__repr__": ReprMixin.__repr__,
"__repr_keys__": property( _wrapped_repr_keys), "__repr_args__": _wrapped_repr_args, "supports_embeddings": self["supports_embeddings"], "supports_hf_agent": self["supports_generate_one"], "has_adapters": self._adapters_mapping is not None}))
__all__ = ["LLMRunner", "LLMRunnable", "Runner", "LLM", "llm_runner_class", "llm_runnable_class", "LLMEmbeddings"]

View File

@@ -19,14 +19,20 @@ generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name
runners: t.Sequence[bentoml.Runner] = [runner]
if not runner.supports_embeddings: runners.append(generic_embedding_runner)
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)
_JsonInput=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True), "adapter_name": ""})
@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
@svc.api(route="/v1/generate", input=_JsonInput, output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
"""One-shot generation endpoint: validate the payload against the model's config and run the runner once."""
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
config = qa_inputs.llm_config.model_dump()
# adapter_name selects a LoRA adapter on the runner side, if any were loaded.
responses = await runner.generate.async_run(qa_inputs.prompt, **{"adapter_name": qa_inputs.adapter_name, **config})
return openllm.GenerationOutput(responses=responses, configuration=config)
# BUG FIX: the SSE media type is "text/event-stream" (hyphen); the previous
# "text/event_stream" is not a registered MIME type and breaks EventSource clients.
@svc.api(route="/v1/generate_stream", input=_JsonInput, output=bentoml.io.Text(content_type="text/event-stream"))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
    """Streaming generation endpoint: emit decoded token chunks as server-sent events."""
    qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
    # async_stream relays the runner's generate_iterator output chunk-by-chunk.
    return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, **qa_inputs.llm_config.model_dump())
@svc.api(route="/v1/metadata", input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample({"model_id": runner.llm.model_id, "timeout": 3600, "model_name": llm_config["model_name"], "framework": "pt", "configuration": "", "supports_embeddings": runner.supports_embeddings, "supports_hf_agent": runner.supports_hf_agent}))
def metadata_v1(_: str) -> openllm.MetadataOutput:
    """Report static service metadata: model identity, framework, timeout and capability flags."""
    return openllm.MetadataOutput(
        timeout=llm_config["timeout"],
        model_name=llm_config["model_name"],
        framework=llm_config["env"]["framework_value"],
        model_id=runner.llm.model_id,
        configuration=llm_config.model_dump_json().decode(),
        supports_embeddings=runner.supports_embeddings,
        supports_hf_agent=runner.supports_hf_agent,
    )

View File

@@ -69,7 +69,7 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
class LLMRunner(bentoml.Runner, t.Generic[M, T]):
__doc__: str
@@ -85,7 +85,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]]
generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
def __init__(self, runnable_class: type[LLMRunnable[M, T]], *, runnable_init_params: dict[str, t.Any] | None = ..., name: str | None = ..., scheduling_strategy: type[Strategy] = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, dict[str, int]] | None = ..., embedded: bool = False,) -> None: ...
def __call__(self, prompt: str, **attrs: t.Any) -> t.Any: ...
@abc.abstractmethod

View File

@@ -11,9 +11,6 @@ class Llama(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaToke
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {}
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), generation_config=self.config.model_construct_env(**attrs).to_generation_config(), do_sample=True, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])), skip_special_tokens=True, clean_up_tokenization_spaces=True)
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device)
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

View File

@@ -128,7 +128,7 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
"""
try:
model = bentoml.models.get(llm.tag)
if model.info.module not in ("openllm.serialisation.transformers", __name__):
if model.info.module not in ("openllm.serialisation.transformers" "bentoml.transformers", "bentoml._internal.frameworks.transformers", __name__): # NOTE: backward compatible with previous version of OpenLLM.
raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.")
if "runtime" in model.info.labels and model.info.labels["runtime"] != llm.runtime:
raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")