feat: token streaming and SSE support (#240)

This commit is contained in:
Aaron Pham
2023-08-20 07:32:49 -04:00
committed by GitHub
parent 69fb149069
commit 9e205b4963
8 changed files with 161 additions and 87 deletions

View File

@@ -3,8 +3,10 @@ from __future__ import annotations
import typing as t, transformers
if t.TYPE_CHECKING: import torch, openllm
# reexport from transformers
LogitsProcessorList = transformers.LogitsProcessorList
StoppingCriteriaList = transformers.StoppingCriteriaList
class StopSequenceCriteria(transformers.StoppingCriteria):
def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast):
if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
@@ -20,3 +22,18 @@ def prepare_logits_processor(config: openllm.LLMConfig) -> transformers.LogitsPr
if 1e-8 <= generation_config["top_p"]: logits_processor.append(transformers.TopPLogitsWarper(generation_config["top_p"]))
if generation_config["top_k"] > 0: logits_processor.append(transformers.TopKLogitsWarper(generation_config["top_k"]))
return logits_processor
# NOTE: The ordering here is important. Some models have two of these and we have a preference for which value gets used.
SEQLEN_KEYS = ["max_sequence_length", "seq_length", "max_position_embeddings", "max_seq_len", "model_max_length"]
def get_context_length(config: transformers.PretrainedConfig) -> int:
    """Return the maximum context length declared by ``config``.

    The first attribute from ``SEQLEN_KEYS`` that is set wins, scaled by the
    RoPE scaling factor when one is declared. Falls back to 2048 when the
    config exposes none of the known keys.
    """
    rope_scaling = getattr(config, "rope_scaling", None)
    # Use the local binding and tolerate rope_scaling dicts without a
    # "factor" entry (some scaling schemes omit it) instead of raising KeyError.
    rope_scaling_factor = rope_scaling.get("factor", 1.0) if rope_scaling else 1.0
    for key in SEQLEN_KEYS:
        value = getattr(config, key, None)
        if value is not None:
            return int(rope_scaling_factor * value)
    return 2048
def is_sentence_complete(output: str) -> bool:
    """Return True when *output* ends with sentence-terminating punctuation (ASCII or CJK).

    BUG FIX: the previous tuple contained mis-encoded empty strings, and
    ``str.endswith("")`` is always True, so every output was reported as
    complete. Restore the intended fullwidth terminators: 。 ？ ！ … ”.
    """
    return output.endswith((".", "?", "!", "...", "。", "？", "！", "…", '"', "'", "”"))
def is_partial_stop(output: str, stop_str: str) -> bool:
    """Check whether the output contains a partial stop str."""
    overlap = min(len(output), len(stop_str))
    # k == 0 probes the whole output (output[-0:] == output); larger k
    # probes progressively longer suffixes, mirroring the index loop form.
    return any(stop_str.startswith(output[-k:]) for k in range(overlap))

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
import functools, inspect, logging, os, re, traceback, types, typing as t, uuid
from abc import ABC, abstractmethod
from pathlib import Path
import attr, fs.path, inflection, orjson, bentoml, openllm
import attr, fs.path, inflection, orjson, bentoml, openllm, gc
from huggingface_hub import hf_hub_download
from bentoml._internal.models.model import ModelSignature
@@ -152,7 +152,7 @@ class LLMInterface(ABC, t.Generic[M, T]):
"""
raise NotImplementedError
def generate_iterator(self, prompt: str, **attrs: t.Any) -> t.Iterator[t.Any]:
def generate_iterator(self, prompt: str, /, **attrs: t.Any) -> t.Iterator[t.Any]:
"""The iterator version of `generate` function."""
raise NotImplementedError("Currently generate_iterator requires SSE (Server-side events) support, which is not yet implemented.")
@@ -605,8 +605,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
return f"{tag_name}:{model_version}"
@classmethod
def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag:
return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))
def generate_tag(cls, *param_decls: t.Any, **attrs: t.Any) -> bentoml.Tag: return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs))
def __init__(
self, *args: t.Any, model_id: str, llm_config: LLMConfig, bettertransformer: bool | None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, _quantize_method: t.Literal["int8", "int4", "gptq"] | None, _runtime: t.Literal["ggml", "transformers"], _model_version: str,
@@ -711,7 +710,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, {**model_kwds, **normalized_model_kwds}, {**tokenizer_kwds, **normalized_tokenizer_kwds}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local)
# handle trust_remote_code
self.__llm_trust_remote_code__ = self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"])
_from_env = os.getenv("TRUST_REMOTE_CODE", None)
self.__llm_trust_remote_code__ = first_not_none(str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"]))
self.llm_post_init()
# we set it here so that we allow subclass to overwrite bettertransformer in llm_post_init
@@ -725,55 +725,30 @@ class LLM(LLMInterface[M, T], ReprMixin):
super().__setattr__(attr, value)
@property
def adapters_mapping(self) -> AdaptersMapping | None:
return self._adapters_mapping
def adapters_mapping(self) -> AdaptersMapping | None: return self._adapters_mapping
@adapters_mapping.setter
def adapters_mapping(self, value: AdaptersMapping) -> None:
self._adapters_mapping = value
def adapters_mapping(self, value: AdaptersMapping) -> None: self._adapters_mapping = value
@property
def __repr_keys__(self) -> set[str]:
return {"model_id", "runner_name", "config", "adapters_mapping", "runtime", "tag"}
def __repr_keys__(self) -> set[str]: return {"model_id", "runner_name", "config", "adapters_mapping", "runtime", "tag"}
def __repr_args__(self) -> ReprArgs:
for k in self.__repr_keys__:
if k == "config": yield k, self.config.model_dump(flatten=True)
else: yield k, getattr(self, k)
@property
def model_id(self) -> str:
return self._model_id
def model_id(self) -> str: return self._model_id
@property
def runtime(self) -> t.Literal["ggml", "transformers"]:
return self._runtime
def runtime(self) -> t.Literal["ggml", "transformers"]: return self._runtime
@property
def runner_name(self) -> str:
return f"llm-{self.config['start_name']}-runner"
# NOTE: The section below defines a loose contract with langchain's LLM interface.
def runner_name(self) -> str: return f"llm-{self.config['start_name']}-runner"
@property
def llm_type(self) -> str:
return normalise_model_name(self._model_id)
def llm_type(self) -> str: return normalise_model_name(self._model_id)
@property
def identifying_params(self) -> DictStrAny:
return {"configuration": self.config.model_dump_json().decode(), "model_ids": orjson.dumps(self.config["model_ids"]).decode()}
def identifying_params(self) -> DictStrAny: return {"configuration": self.config.model_dump_json().decode(), "model_ids": orjson.dumps(self.config["model_ids"]).decode()}
@property
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]:
return (self._model_decls, self._model_attrs), self._tokenizer_attrs
def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: return (self._model_decls, self._model_attrs), self._tokenizer_attrs
@property
def tag(self) -> bentoml.Tag:
return self._tag
# ensure_model_id_exists can be called to save the model to local store
def ensure_model_id_exists(self) -> bentoml.Model:
from . import import_model
return import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format)
def tag(self) -> bentoml.Tag: return self._tag
def ensure_model_id_exists(self) -> bentoml.Model: return openllm.import_model(self.config["start_name"], model_id=self.model_id, model_version=self._model_version, runtime=self.runtime, implementation=self.__llm_implementation__, quantize=self._quantize_method, serialisation_format=self._serialisation_format)
@property
def _bentomodel(self) -> bentoml.Model:
@@ -916,10 +891,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
"""
models = models if models is not None else []
try:
models.append(self._bentomodel)
except bentoml.exceptions.NotFound as err:
raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None
try: models.append(self._bentomodel)
except bentoml.exceptions.NotFound as err: raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None
if scheduling_strategy is None:
from ._strategies import CascadingResourceStrategy
@@ -927,7 +900,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0)))
generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True)))
generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
# NOTE: returning the two langchain API's to the runner
return llm_runner_class(self)(
@@ -936,9 +909,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
)
# NOTE: Scikit API
def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
return self.__call__(prompt, **attrs)
def predict(self, prompt: str, **attrs: t.Any) -> t.Any: return self(prompt, **attrs)
def __call__(self, prompt: str, **attrs: t.Any) -> t.Any:
"""Returns the generation result and format the result.
@@ -956,7 +927,95 @@ class LLM(LLMInterface[M, T], ReprMixin):
prompt, generate_kwargs, postprocess_kwargs = self.sanitize_parameters(prompt, **attrs)
return self.postprocess_generate(prompt, self.generate(prompt, **generate_kwargs), **postprocess_kwargs)
# fmt: off
def generate(self, prompt: str, **attrs: t.Any) -> t.Any:
    """Run generation to completion and return the final streamed payload.

    Drains ``generate_iterator`` and returns its last yielded item (the chunk
    carrying the ``finish_reason``), or ``None`` when nothing was yielded.
    """
    # TODO: support different generation strategies, similar to self.model.generate
    final: t.Any = None
    # BUG FIX: previously the loop variable was returned unbound when the
    # iterator yielded nothing (e.g. max_new_tokens == 0), raising NameError.
    for final in self.generate_iterator(prompt, **attrs):
        pass
    return final
def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> str:
    """Return the text payload of *generation_result*.

    Streaming results are dicts carrying a "text" key; any other result is
    passed through untouched.
    """
    is_stream_payload = isinstance(generation_result, dict)
    return generation_result["text"] if is_stream_payload else generation_result
def generate_iterator(self, prompt: str, /,
*, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]:
"""Stream generation results token-by-token for decoder-only models.

Yields dicts of the form {"text": ..., "usage": {...}, "finish_reason": ...};
intermediate chunks carry finish_reason=None, the final chunk carries
"length", "stop" or None. `echo` controls whether the prompt is included in
the decoded output; `stream_interval` throttles how often chunks are yielded.
"""
# NOTE: encoder-decoder models will need to implement their own generate_iterator for now
# inspired from fastchat's generate_stream_func
from ._generation import prepare_logits_processor, get_context_length, is_partial_stop
len_prompt = len(prompt)
if stop_token_ids is None: stop_token_ids = []
# NOTE(review): this appends to the caller's list when one was passed in —
# a copy would avoid the visible mutation; confirm no caller relies on it.
stop_token_ids.append(self.tokenizer.eos_token_id)
logits_processor = prepare_logits_processor(self.config)
input_ids = self.tokenizer(prompt).input_ids
if context_length is None: context_length = get_context_length(self.model.config)
# Reserve room for the new tokens (and one extra slot) within the context window.
max_src_len = context_length - self.config["max_new_tokens"] - 1
input_ids = input_ids[-max_src_len:]
output_ids = list(input_ids)
input_echo_len = len(input_ids)
past_key_values = out = token = None
# NOTE(review): `torch` appears to be imported under TYPE_CHECKING only in
# this module — confirm it is available at runtime here.
for i in range(self.config["max_new_tokens"]):
if i == 0: # prefill
out = self.model(torch.as_tensor([input_ids], device=self.device), use_cache=True)
else: # decoding
out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), use_cache=True, past_key_values=past_key_values) # type: ignore[has-type]
logits = out.logits
past_key_values = out.past_key_values
# Apply repetition penalty/temperature/top-p/top-k warpers when configured;
# otherwise take the raw logits of the last position.
last_token_logits = logits_processor(torch.as_tensor([output_ids], device=logits.device) if self.config["repetition_penalty"] > 1.0 else None, logits[:, -1, :])[0] if logits_processor else logits[0, -1, :]
# Switch to CPU by avoiding some bugs in mps backend.
if self.device.type == "mps": last_token_logits = last_token_logits.float().to("cpu")
# Near-zero temperature or top_p degenerates to greedy decoding.
if self.config["temperature"] < 1e-5 or self.config["top_p"] < 1e-8: token = int(torch.argmax(last_token_logits)) # greedy
else: token = int(torch.multinomial(torch.softmax(last_token_logits, dim=-1), num_samples=1))
output_ids.append(token)
if token in stop_token_ids: stopped = True
else: stopped = False
# Yield the output tokens
if i % stream_interval == 0 or i == self.config["max_new_tokens"] - 1 or stopped:
tmp_output_ids = output_ids if echo else output_ids[input_echo_len:]
# When echoing, only search for stop strings past the prompt text.
rfind_start = len_prompt if echo else 0
output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True)
partially_stopped = False
if stop:
if isinstance(stop, str):
# Truncate at the last full stop-string occurrence, else remember a
# partial match so we hold back this chunk.
pos = output.rfind(stop, rfind_start)
if pos != -1: output, stopped = output[:pos], True
else: partially_stopped = is_partial_stop(output, stop)
elif isinstance(stop, t.Iterable):
for each_stop in stop:
pos = output.rfind(each_stop, rfind_start)
if pos != -1:
output, stopped = output[:pos], True
break
else:
partially_stopped = is_partial_stop(output, each_stop)
if partially_stopped: break
else: raise ValueError("Invalid stop field type.")
# Prevent yielding partial stop sequence
if not partially_stopped:
yield {"text": output, "usage": {"prompt_tokens": input_echo_len, "completion_tokens": i, "total_tokens": input_echo_len + i}, "finish_reason": None}
if stopped: break
# Finish stream event, which contains finish reason
if i == self.config["max_new_tokens"] - 1: finish_reason = "length"
elif stopped: finish_reason = "stop"
else: finish_reason = None
# NOTE(review): if max_new_tokens is 0 the loop never runs and `output`/`i`
# are unbound here — confirm configs always enforce max_new_tokens >= 1.
yield {"text": output, "usage": {"prompt_tokens": input_echo_len, "completion_tokens": i, "total_tokens": input_echo_len + i}, "finish_reason": finish_reason}
# Clean
del past_key_values, out
gc.collect()
torch.cuda.empty_cache()
@overload
def Runner(model_name: str, *, model_id: str | None = None, model_version: str | None = ..., init_local: t.Literal[False, True] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
@overload
@@ -965,8 +1024,6 @@ def Runner(model_name: str, *, model_id: str = ..., model_version: str | None =
def Runner(model_name: str, *, ensure_available: bool | None = None, init_local: bool = ..., implementation: LiteralRuntime | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
@overload
def Runner(model_name: str, *, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4", "gptq"] | None = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLMRunner[t.Any, t.Any]: ...
# fmt: on
def Runner(model_name: str, ensure_available: bool | None = None, init_local: bool = False, implementation: LiteralRuntime | None = None, llm_config: LLMConfig | None = None, **attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
"""Create a Runner for given LLM. For a list of currently supported LLM, check out 'openllm models'.
@@ -1008,9 +1065,7 @@ def Runner(model_name: str, ensure_available: bool | None = None, init_local: bo
if init_local: runner.init_local(quiet=True)
return runner
def method_signature(sig: ModelSignature) -> ModelSignatureDict:
return bentoml_cattr.unstructure(sig)
def method_signature(sig: ModelSignature) -> ModelSignatureDict: return bentoml_cattr.unstructure(sig)
class SetAdapterOutput(t.TypedDict):
    # Result payload for adapter-switching endpoints.
    # Whether the adapter switch succeeded.
    success: bool
    # Human-readable status message.
    message: str
@@ -1019,7 +1074,6 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
class _Runnable(bentoml.Runnable):
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "amd.com/gpu", "cpu")
SUPPORTS_CPU_MULTI_THREADING = True
def __init__(__self: _Runnable):
# NOTE: The side effect of this line
# is that it will load the imported model during
@@ -1028,40 +1082,41 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate
if self.adapters_mapping is not None:
logger.info("Applying LoRA to %s...", self.runner_name)
self.apply_adapter(inference_mode=True, load_adapters="all")
def set_adapter(__self: _Runnable, adapter_name: str) -> None:
"""Activate the named LoRA adapter on the underlying model.

Raises ValueError when no adapters were loaded for this server and
RuntimeError when the model is not a PeftModel. The "default" adapter is
assumed to be already active and is not re-applied.
"""
# NOTE(review): `self` is the LLM captured by the enclosing closure while
# `__self` is the Runnable instance — confirm against the full file.
if self.__llm_adapter_map__ is None: raise ValueError("No adapters available for current running server.")
elif not isinstance(self.model, peft.PeftModel): raise RuntimeError("Model is not a PeftModel")
if adapter_name != "default": self.model.set_adapter(adapter_name)
logger.info("Successfully apply LoRA layer %s", adapter_name)
@bentoml.Runnable.method(**method_signature(embeddings_sig))
def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]:
return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]
def embeddings(__self: _Runnable, prompt: str | list[str]) -> t.Sequence[LLMEmbeddings]: return [self.embeddings([prompt] if isinstance(prompt, str) else prompt)]
@bentoml.Runnable.method(**method_signature(generate_sig))
def __call__(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
"""Run one generation on the enclosing LLM, optionally switching LoRA adapter first."""
# "adapter_name" is consumed here so it is not forwarded to generate().
adapter_name = attrs.pop("adapter_name", None)
if adapter_name is not None: __self.set_adapter(adapter_name)
# `self` is the LLM captured by the enclosing llm_runnable_class closure.
return self.generate(prompt, **attrs)
@bentoml.Runnable.method(**method_signature(generate_sig))
def generate(__self: _Runnable, prompt: str, **attrs: t.Any) -> list[t.Any]:
"""Delegate to the enclosing LLM's generate(), optionally switching LoRA adapter first."""
# "adapter_name" is consumed here so it is not forwarded to generate().
adapter_name = attrs.pop("adapter_name", None)
if adapter_name is not None: __self.set_adapter(adapter_name)
# `self` is the LLM captured by the enclosing llm_runnable_class closure.
return self.generate(prompt, **attrs)
@bentoml.Runnable.method(**method_signature(generate_sig))
def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal["generated_text"], str]]:
"""Delegate to the enclosing LLM's generate_one() with explicit stop strings, optionally switching LoRA adapter first."""
# "adapter_name" is consumed here so it is not forwarded to generate_one().
adapter_name = attrs.pop("adapter_name", None)
if adapter_name is not None: __self.set_adapter(adapter_name)
# `self` is the LLM captured by the enclosing llm_runnable_class closure.
return self.generate_one(prompt, stop, **attrs)
@bentoml.Runnable.method(**method_signature(generate_iterator_sig))
def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[t.Any, None, None]:
def generate_iterator(__self: _Runnable, prompt: str, **attrs: t.Any) -> t.Generator[str, None, str]:
adapter_name = attrs.pop("adapter_name", None)
if adapter_name is not None: __self.set_adapter(adapter_name)
yield self.generate_iterator(prompt, **attrs)
pre = 0
for outputs in self.generate_iterator(prompt, **attrs):
output_text = outputs["text"].strip().split(" ")
now = len(output_text) - 1
if now > pre:
yield " ".join(output_text[pre:now])
pre = now
yield " ".join(output_text[pre:])
return " ".join(output_text)
return types.new_class(self.__class__.__name__ + "Runnable", (_Runnable,), {}, lambda ns: ns.update({"SUPPORTED_RESOURCES": ("nvidia.com/gpu", "amd.com/gpu") if self.config["requires_gpu"] else ("nvidia.com/gpu", "amd.com/gpu", "cpu"), "__module__": self.__module__, "__doc__": self.config["env"].start_docstring}))
@@ -1098,9 +1153,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
"""
return __self.embeddings.run([prompt] if isinstance(prompt, str) else prompt)
def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]:
return {"config", "llm_type", "runner_methods", "runtime", "llm_tag"}
def _wrapped_repr_keys(_: LLMRunner[M, T]) -> set[str]: return {"config", "llm_type", "runner_methods", "runtime", "llm_tag"}
def _wrapped_repr_args(__self: LLMRunner[M, T]) -> ReprArgs:
yield "runner_methods", {method.name: {"batchable": method.config.batchable, "batch_dim": method.config.batch_dim if method.config.batchable else None} for method in __self.runner_methods}
yield "config", self.config.model_dump(flatten=True)
@@ -1108,13 +1161,8 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
yield "runtime", self.runtime
yield "llm_tag", self.tag
return types.new_class(
self.__class__.__name__ + "Runner", (bentoml.Runner,), exec_body=lambda ns: ns.update({
"llm_type": self.llm_type, "identifying_params": self.identifying_params, "llm_tag": self.tag, "llm": self, # NOTE: self reference to LLM
"config": self.config, "implementation": self.__llm_implementation__, "peft_adapters": property(fget=available_adapters), "download_model": self.ensure_model_id_exists, "__call__": _wrapped_generate_run, "embed": _wrapped_embeddings_run, "__module__": self.__module__, "__doc__": self.config["env"].start_docstring, "__repr__": ReprMixin.__repr__, "__repr_keys__": property(
_wrapped_repr_keys
), "__repr_args__": _wrapped_repr_args, "supports_embeddings": self["supports_embeddings"], "supports_hf_agent": self["supports_generate_one"], "has_adapters": self._adapters_mapping is not None,
}),
)
return types.new_class(self.__class__.__name__ + "Runner", (bentoml.Runner,), exec_body=lambda ns: ns.update({"llm_type": self.llm_type, "identifying_params": self.identifying_params, "llm_tag": self.tag, "llm": self, "config": self.config, "implementation": self.__llm_implementation__, "peft_adapters": property(fget=available_adapters),
"download_model": self.ensure_model_id_exists, "__call__": _wrapped_generate_run, "embed": _wrapped_embeddings_run, "__module__": self.__module__, "__doc__": self.config["env"].start_docstring, "__repr__": ReprMixin.__repr__,
"__repr_keys__": property( _wrapped_repr_keys), "__repr_args__": _wrapped_repr_args, "supports_embeddings": self["supports_embeddings"], "supports_hf_agent": self["supports_generate_one"], "has_adapters": self._adapters_mapping is not None}))
__all__ = ["LLMRunner", "LLMRunnable", "Runner", "LLM", "llm_runner_class", "llm_runnable_class", "LLMEmbeddings"]

View File

@@ -19,14 +19,20 @@ generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, name
runners: t.Sequence[bentoml.Runner] = [runner]
if not runner.supports_embeddings: runners.append(generic_embedding_runner)
svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners)
_JsonInput=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True), "adapter_name": ""})
@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
@svc.api(route="/v1/generate", input=_JsonInput, output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
"""One-shot generation endpoint: validate the payload against the model's config and run the runner once."""
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
config = qa_inputs.llm_config.model_dump()
# adapter_name selects a LoRA adapter on the runner side, if any were loaded.
responses = await runner.generate.async_run(qa_inputs.prompt, **{"adapter_name": qa_inputs.adapter_name, **config})
return openllm.GenerationOutput(responses=responses, configuration=config)
# BUG FIX: the SSE media type is "text/event-stream" (hyphen); the previous
# "text/event_stream" is not a registered MIME type and breaks EventSource clients.
@svc.api(route="/v1/generate_stream", input=_JsonInput, output=bentoml.io.Text(content_type="text/event-stream"))
async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
    """Streaming generation endpoint: emit decoded token chunks as server-sent events."""
    qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
    # async_stream relays the runner's generate_iterator output chunk-by-chunk.
    return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, **qa_inputs.llm_config.model_dump())
@svc.api(route="/v1/metadata", input=bentoml.io.Text(), output=bentoml.io.JSON.from_sample({"model_id": runner.llm.model_id, "timeout": 3600, "model_name": llm_config["model_name"], "framework": "pt", "configuration": "", "supports_embeddings": runner.supports_embeddings, "supports_hf_agent": runner.supports_hf_agent}))
def metadata_v1(_: str) -> openllm.MetadataOutput:
    """Report static service metadata: model identity, framework, timeout and capability flags."""
    return openllm.MetadataOutput(
        timeout=llm_config["timeout"],
        model_name=llm_config["model_name"],
        framework=llm_config["env"]["framework_value"],
        model_id=runner.llm.model_id,
        configuration=llm_config.model_dump_json().decode(),
        supports_embeddings=runner.supports_embeddings,
        supports_hf_agent=runner.supports_hf_agent,
    )

View File

@@ -69,7 +69,7 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], LLMEmbeddings]
generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
class LLMRunner(bentoml.Runner, t.Generic[M, T]):
__doc__: str
@@ -85,7 +85,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[LLMEmbeddings]]
generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal["generated_text"], str]]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[t.Any, None, None]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
def __init__(self, runnable_class: type[LLMRunnable[M, T]], *, runnable_init_params: dict[str, t.Any] | None = ..., name: str | None = ..., scheduling_strategy: type[Strategy] = ..., models: list[bentoml.Model] | None = ..., max_batch_size: int | None = ..., max_latency_ms: int | None = ..., method_configs: dict[str, dict[str, int]] | None = ..., embedded: bool = False,) -> None: ...
def __call__(self, prompt: str, **attrs: t.Any) -> t.Any: ...
@abc.abstractmethod

View File

@@ -11,9 +11,6 @@ class Llama(openllm.LLM["transformers.LlamaForCausalLM", "transformers.LlamaToke
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
def sanitize_parameters(self, prompt: str, top_k: int | None = None, top_p: float | None = None, temperature: float | None = None, max_new_tokens: int | None = None, use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE("v2" if use_llama2_prompt else "v1") if use_default_prompt_template else None, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k}, {}
def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
with torch.inference_mode(): return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors="pt").to(self.device), generation_config=self.config.model_construct_env(**attrs).to_generation_config(), do_sample=True, stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()])), skip_special_tokens=True, clean_up_tokenization_spaces=True)
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
encoding = self.tokenizer(prompts, padding=True, return_tensors="pt").to(self.device)
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

View File

@@ -128,7 +128,7 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
"""
try:
model = bentoml.models.get(llm.tag)
if model.info.module not in ("openllm.serialisation.transformers", __name__):
if model.info.module not in ("openllm.serialisation.transformers" "bentoml.transformers", "bentoml._internal.frameworks.transformers", __name__): # NOTE: backward compatible with previous version of OpenLLM.
raise bentoml.exceptions.NotFound(f"Model {model.tag} was saved with module {model.info.module}, not loading with 'openllm.serialisation.transformers'.")
if "runtime" in model.info.labels and model.info.labels["runtime"] != llm.runtime:
raise openllm.exceptions.OpenLLMException(f"Model {model.tag} was saved with runtime {model.info.labels['runtime']}, not loading with {llm.runtime}.")