Mirror of https://github.com/bentoml/OpenLLM.git, synced 2026-01-22 14:31:26 -05:00
fix(perf): respect per request information
remove use_default_prompt_template options
add pretrained to list of start help docstring
fix flax generation config
improve flax and tensorflow implementation

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -16,8 +16,6 @@ classifiers = [
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: Implementation :: CPython",
@@ -71,7 +69,7 @@ all = ["openllm[fine-tune]", "openllm[flan-t5]", "openllm[chatglm]", "openllm[st
chatglm = ["cpm_kernels", "sentencepiece"]
falcon = ["einops", "xformers", "safetensors"]
fine-tune = ["peft", "bitsandbytes", "datasets", "accelerate"]
flan-t5 = ["flax", "jax", "jaxlib", "tensorflow"]
flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
starcoder = ["bitsandbytes"]

[project.urls]
@@ -48,6 +48,7 @@ import inflection
import orjson
from cattr.gen import make_dict_unstructure_fn, override
from click_option_group import optgroup
from deepmerge.merger import Merger

import openllm

@@ -83,6 +84,15 @@ __all__ = ["LLMConfig"]

logger = logging.getLogger(__name__)

config_merger = Merger(
# merge dicts
type_strategies=[(DictStrAny, "merge")],
# override all other types
fallback_strategies=["override"],
# override conflicting types
type_conflict_strategies=["override"],
)


@t.overload
def attrs_to_options(
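For readers unfamiliar with deepmerge, a minimal sketch of how a Merger configured this way folds per-request overrides into a base configuration (DictStrAny in the diff is a typing alias; plain dict is used here for illustration):

from deepmerge.merger import Merger

config_merger = Merger(
    # dict values are merged recursively instead of being replaced wholesale
    type_strategies=[(dict, "merge")],
    # any other type falls back to "last one wins"
    fallback_strategies=["override"],
    # conflicting types also resolve to the incoming value
    type_conflict_strategies=["override"],
)

base = {"generation_config": {"temperature": 0.75, "max_new_tokens": 256}, "timeout": 30}
per_request = {"generation_config": {"temperature": 0.2}}

# merge() mutates and returns its first argument
config_merger.merge(base, per_request)
assert base["generation_config"] == {"temperature": 0.2, "max_new_tokens": 256}
assert base["timeout"] == 30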
@@ -593,13 +603,9 @@ def _make_internal_generation_class(cls: type[LLMConfig]) -> type[GenerationConf
return generated_cls


USE_DEFAULT_PROMPT_TEMPLATE_DOCSTRING = """\
Whether a model should use their default prompt template setup. This is useful if
users wants to do some prompt engineering. Default to True.
"""

# NOTE: This DEFAULT_KEYMAP is a way to dynamically generate attr.field
DEFAULT_LLMCONFIG_ATTRS = (("use_default_prompt_template", True, USE_DEFAULT_PROMPT_TEMPLATE_DOCSTRING, bool),)
# NOTE: This DEFAULT_LLMCONFIG_ATTRS is a way to dynamically generate attr.field
# and will be saved for future use in LLMConfig if we have some shared config.
DEFAULT_LLMCONFIG_ATTRS: tuple[tuple[str, t.Any, str, type[t.Any]], ...] = ()


@attr.define
@@ -652,6 +658,9 @@ class LLMConfig:
__openllm_url__: str = Field(None, init=False)
"""The resolved url for this LLMConfig."""

__openllm_accepted_keys__: set[str] = Field(None, init=False)
"""The accepted keys for this LLMConfig."""

__openllm_requirements__: list[str] | None = None
"""The default PyPI requirements needed to run this given LLM. By default, we will depend on
bentoml, torch, transformers."""
@@ -674,10 +683,6 @@ class LLMConfig:
"""The result generated GenerationConfig class for this LLMConfig. This will be used
to create the generation_config argument that can be used throughout the lifecycle."""

# NOTE: The following can be shared accross all LLMConfig subclasses.
use_default_prompt_template: bool = Field(True, init=False)
use_default_prompt_template.__doc__ = USE_DEFAULT_PROMPT_TEMPLATE_DOCSTRING

def __init_subclass__(
cls,
*,
@@ -757,11 +762,15 @@ class LLMConfig:
cls.__openllm_attrs__ = tuple(a.name for a in own_attrs)

# NOTE: Enable some default attributes that can be shared across all LLMConfig
base_attrs = [
attr.Attribute.from_counting_attr(k, cls.Field(default, env=field_env_key(k), description=docs), hints)
for k, default, docs, hints in DEFAULT_LLMCONFIG_ATTRS
if k not in cls.__openllm_attrs__
] + base_attrs
if len(DEFAULT_LLMCONFIG_ATTRS) > 0:
# NOTE: update the hints for default variables we dynamically added.
hints.update({k: hints for k, _, _, hints in DEFAULT_LLMCONFIG_ATTRS})
base_attrs = [
attr.Attribute.from_counting_attr(k, cls.Field(default, env=field_env_key(k), description=docs), hints)
for k, default, docs, hints in DEFAULT_LLMCONFIG_ATTRS
if k not in cls.__openllm_attrs__
] + base_attrs

attrs: list[attr.Attribute[t.Any]] = own_attrs + base_attrs

# Mandatory vs non-mandatory attr order only matters when they are part of
@@ -817,10 +826,10 @@ class LLMConfig:

hints.update(t.get_type_hints(cls.generation_class))

# NOTE: update the hints for default variables we dynamically added.
hints.update({k: hints for k, _, _, hints in DEFAULT_LLMCONFIG_ATTRS})
cls.__openllm_hints__ = hints

cls.__openllm_accepted_keys__ = set(cls.__openllm_attrs__) | set(attr.fields_dict(cls.generation_class))

@property
def name_type(self) -> t.Literal["dasherize", "lowercase"]:
return self.__openllm_name_type__
@@ -832,8 +841,10 @@ class LLMConfig:
__openllm_extras__: dict[str, t.Any] | None = None,
**attrs: t.Any,
):
to_exclude = list(attr.fields_dict(self.generation_class)) + list(self.__openllm_attrs__)
self.__openllm_extras__ = __openllm_extras__ or {k: v for k, v in attrs.items() if k not in to_exclude}
self.__openllm_extras__ = openllm.utils.first_not_none(__openllm_extras__, default={})
config_merger.merge(
self.__openllm_extras__, {k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__}
)

attrs = {k: v for k, v in attrs.items() if k not in self.__openllm_extras__ and v is not None}

@@ -844,9 +855,11 @@ class LLMConfig:

attrs = {k: v for k, v in attrs.items() if k not in generation_config}

extras = set(attrs).difference(set(attr.fields_dict(self.__class__)))
self.__attrs_init__(**{k: v for k, v in attrs.items() if k in self.__openllm_attrs__})

self.__attrs_init__(**{k: v for k, v in attrs.items() if k not in extras})
# The rest update to extras
attrs = {k: v for k, v in attrs.items() if k not in self.__openllm_attrs__}
config_merger.merge(self.__openllm_extras__, attrs)

def __repr__(self) -> str:
bases = f"{self.__class__.__qualname__.rsplit('>.', 1)[-1]}(generation_config={repr(self.generation_class())}"
@@ -897,35 +910,35 @@ class LLMConfig:
return orjson.dumps(self.model_dump(**kwargs))

@classmethod
def model_construct_env(cls, __llm_config__: LLMConfig | None = None, **attrs: t.Any) -> LLMConfig:
def model_construct_env(cls, **attrs: t.Any) -> LLMConfig:
"""A helpers that respect configuration values that
sets from environment variables for any given configuration class.
"""
# NOTE: filter out None values
attrs = {k: v for k, v in attrs.items() if v is not None}
if "generation_config" in attrs:
# NOTE: We will need to flatten the attrs dict
generation_config = attrs.pop("generation_config", {})
attrs.update(generation_config)

env = ModelEnv(cls.__openllm_model_name__)
model_config = ModelEnv(cls.__openllm_model_name__).model_config

env_json_string = os.environ.get(env.model_config, None)
env_json_string = os.environ.get(model_config, None)

if env_json_string is not None:
try:
config_from_env = orjson.loads(env_json_string)
except orjson.JSONDecodeError as e:
raise RuntimeError(f"Failed to parse '{env.model_config}' as valid JSON string.") from e
config_from_env.update(attrs)
return bentoml_cattr.structure(config_from_env, cls)
raise RuntimeError(f"Failed to parse '{model_config}' as valid JSON string.") from e
ncls = bentoml_cattr.structure(config_from_env, cls)
else:
ncls = cls()

if __llm_config__ is not None:
# NOTE: We only hit this branch on server-side, to ensure per-request configuration
# is respected.
attrs.update(__llm_config__.model_dump(flatten=True))
if "generation_config" in attrs:
generation_config = attrs.pop("generation_config")
if not LazyType(DictStrAny).isinstance(generation_config):
raise RuntimeError(f"Expected a dictionary, but got {type(generation_config)}")
else:
generation_config = {k: v for k, v in attrs.items() if k in attr.fields_dict(ncls.generation_class)}

return bentoml_cattr.structure(attrs, cls)
attrs = {k: v for k, v in attrs.items() if k not in generation_config}
ncls.generation_config = attr.evolve(ncls.generation_config, **generation_config)
return attr.evolve(ncls, **attrs)

def model_validate_click(self, **attrs: t.Any) -> tuple[LLMConfig, dict[str, t.Any]]:
"""Parse given click attributes into a LLMConfig and return the remaining click attributes."""
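A condensed sketch of the precedence the reworked model_construct_env implements: the model's environment variable provides the baseline, explicit keyword arguments win over it, and generation keys are split back out afterwards. The variable name and the field set below are illustrative, not the exact ones OpenLLM derives:

import os
import json

GENERATION_FIELDS = {"temperature", "top_k", "top_p", "max_new_tokens"}  # illustrative only

def construct_env(model_env_var: str, **attrs):
    # 1. drop unset values so they never override anything
    attrs = {k: v for k, v in attrs.items() if v is not None}
    # 2. flatten a nested generation_config dict into the same namespace
    attrs.update(attrs.pop("generation_config", {}))
    # 3. the environment variable (JSON) is the baseline configuration
    baseline = json.loads(os.environ.get(model_env_var, "{}"))
    # 4. request-level values win over the environment baseline
    merged = {**baseline, **attrs}
    # 5. split generation keys back out so they can evolve generation_config separately
    generation = {k: v for k, v in merged.items() if k in GENERATION_FIELDS}
    top_level = {k: v for k, v in merged.items() if k not in GENERATION_FIELDS}
    return top_level, generation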
@@ -1013,12 +1026,14 @@ def structure_llm_config(data: dict[str, t.Any], cls: type[LLMConfig]) -> LLMCon
raise RuntimeError(f"Expected a dictionary, but got {type(data)}")

cls_attrs = {k: v for k, v in data.items() if k in cls.__openllm_attrs__}
generation_cls_fields = attr.fields_dict(cls.generation_class)
if "generation_config" in data:
generation_config = data.pop("generation_config")
if not LazyType(DictStrAny).isinstance(generation_config):
raise RuntimeError(f"Expected a dictionary, but got {type(generation_config)}")
config_merger.merge(generation_config, {k: v for k, v in data.items() if k in generation_cls_fields})
else:
generation_config = {k: v for k, v in data.items() if k in attr.fields_dict(cls.generation_class)}
generation_config = {k: v for k, v in data.items() if k in generation_cls_fields}
not_extras = list(cls_attrs) + list(generation_config)
# The rest should be passed to extras
data = {k: v for k, v in data.items() if k not in not_extras}
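A rough sketch of the splitting this structuring hook performs, with placeholder field sets standing in for __openllm_attrs__ and the generation class fields:

def split_config(data: dict, cls_fields: set, generation_fields: set):
    # top-level config attributes stay on the LLMConfig itself
    cls_attrs = {k: v for k, v in data.items() if k in cls_fields}
    # an explicit generation_config dict is the base; flattened generation keys merge into it
    generation = dict(data.pop("generation_config", {}))
    generation.update({k: v for k, v in data.items() if k in generation_fields})
    # everything else is carried along as extras
    extras = {k: v for k, v in data.items() if k not in cls_attrs and k not in generation}
    return cls_attrs, generation, extras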
@@ -230,11 +230,7 @@ class LLMInterface(ABC):

It takes a prompt that is given by the user, attrs that can be parsed with the prompt.

NOTE: the attrs should also handle the following default attributes from all LLMConfig:
- use_default_prompt_template

Returns a tuple of three items:
- The processed prompt text depending on `use_default_prompt_template`
- The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig
- The attributes dictionary that will be passed into `self.postprocess_generate`.
"""
@@ -39,14 +39,14 @@ svc = bentoml.Service(name=f"llm-{llm_config.__openllm_start_name__}-service", r


@svc.api(
input=bentoml.io.JSON.from_sample(sample={"prompt": "", "llm_config": {}}),
output=bentoml.io.JSON.from_sample(sample={"responses": [], "configuration": {}}),
input=bentoml.io.JSON.from_sample(sample={"prompt": "", "llm_config": llm_config.model_dump()}),
output=bentoml.io.JSON.from_sample(sample={"responses": [], "configuration": llm_config.model_dump()}),
route="/v1/generate",
)
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
qa = openllm.GenerationInput.for_model(model)(**input_dict)
config = llm_config.model_construct_env(__llm_config__=qa.llm_config).model_dump()
responses = await runner.generate.async_run(qa.prompt, **config)
qa_inputs = openllm.GenerationInput.for_model(model)(**input_dict)
config = qa_inputs.llm_config.model_dump()
responses = await runner.generate.async_run(qa_inputs.prompt, **config)
return openllm.GenerationOutput(responses=responses, configuration=config)
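With the per-request llm_config now carried through GenerationInput and used directly, a single request can override generation parameters without touching the server's defaults. A hedged usage sketch: the port assumes a default local `openllm start` deployment, and the nested payload shape assumes model_dump() nests a generation_config dict.

import requests

payload = {
    "prompt": "Explain what BentoML is in one sentence.",
    # per-request overrides; anything omitted falls back to the model's defaults
    "llm_config": {"generation_config": {"temperature": 0.2, "max_new_tokens": 64}},
}
resp = requests.post("http://localhost:3000/v1/generate", json=payload, timeout=60)
resp.raise_for_status()
body = resp.json()
print(body["responses"][0])   # generated text
print(body["configuration"])  # the configuration actually used for this request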
@@ -314,15 +314,20 @@ def start_model_command(
configure_logging()

ModelEnv = openllm.utils.ModelEnv(model_name)
llm_config = openllm.AutoConfig.for_model(model_name)

docstring = f"""\
{ModelEnv.start_docstring}
\b
The available pretrained models to use with '{model_name}' are: {openllm.AutoLLM.for_model(model_name).pretrained}
"""
command_attrs: dict[str, t.Any] = {
"name": ModelEnv.model_name,
"context_settings": _context_settings or {},
"short_help": f"Start a LLMServer for '{model_name}' ('--help' for more details)",
"help": ModelEnv.start_docstring,
"help": docstring,
}

llm_config = openllm.AutoConfig.for_model(model_name)

aliases: list[str] = []
if llm_config.name_type == "dasherize":
aliases.append(llm_config.__openllm_start_name__)
@@ -110,7 +110,6 @@ class ChatGLM(openllm.LLM):
"num_beams": num_beams,
"top_p": top_p,
"temperature": temperature,
"use_default_prompt_template": use_default_prompt_template,
**attrs,
}
@@ -42,7 +42,6 @@ class DollyV2Config(
return_full_text: bool = openllm.LLMConfig.Field(
False, description="Whether to return the full prompt to the users."
)
use_default_prompt_template: bool = False

class GenerationConfig:
temperature: float = 0.9
@@ -33,6 +33,9 @@ logger = logging.getLogger(__name__)


class DollyV2(openllm.LLM):
if t.TYPE_CHECKING:
config: openllm.DollyV2Config

__openllm_internal__ = True

default_model = "databricks/dolly-v2-3b"
@@ -58,12 +61,20 @@ class DollyV2(openllm.LLM):
torch_dtype=torch_dtype,
device_map=device_map,
)
return bentoml.transformers.save_model(
tag,
pipeline,
custom_objects={"tokenizer": tokenizer},
external_modules=[importlib.import_module(pipeline.__module__)],
)
try:
return bentoml.transformers.save_model(
tag,
pipeline,
custom_objects={"tokenizer": tokenizer},
external_modules=[importlib.import_module(pipeline.__module__)],
)
finally:
import gc

gc.collect()

if openllm.utils.is_torch_available() and torch.cuda.is_available():
torch.cuda.empty_cache()

def sanitize_parameters(
self,
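The try/finally above releases the pipeline's memory once the model is saved. The same idea as a reusable helper (a sketch; whether empty_cache() actually helps depends on the runtime and is not part of the diff):

import contextlib
import gc

@contextlib.contextmanager
def release_memory_after():
    """Run the wrapped block, then aggressively release Python and CUDA memory."""
    try:
        yield
    finally:
        gc.collect()
        try:
            import torch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except ImportError:
            pass

# usage: the heavyweight pipeline is dropped from memory right after saving
# with release_memory_after():
#     bento_model = bentoml.transformers.save_model(tag, pipeline, custom_objects={"tokenizer": tokenizer})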
@@ -72,39 +83,37 @@ class DollyV2(openllm.LLM):
temperature: float | None = None,
top_k: int | None = None,
top_p: float | None = None,
use_default_prompt_template: bool = False,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)
else:
prompt_text = prompt

# NOTE: The rest of attrs should be kwargs for GenerationConfig
generate_kwargs = {
"max_new_tokens": max_new_tokens,
"top_k": top_k,
"top_p": top_p,
"temperature": temperature,
"use_default_prompt_template": use_default_prompt_template,
**attrs,
}

return prompt_text, generate_kwargs, {}
return prompt, generate_kwargs, {}

def postprocess_generate(self, prompt: str, generation_result: str, **_: t.Any) -> str:
return generation_result
def postprocess_generate(
self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any
) -> str:
return generation_result[0]["generated_text"]

@torch.inference_mode()
def generate(self, prompt: str, **attrs: t.Any) -> str:
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
self.model.tokenizer = self.tokenizer
llm_config: openllm.DollyV2Config = self.config.model_construct_env(**attrs)
decoded = self.model(prompt, generation_config=llm_config.to_generation_config())
llm_config = self.config.model_construct_env(**attrs)
decoded: list[dict[t.Literal["generated_text"], str]] = self.model(
prompt, generation_config=llm_config.to_generation_config()
)

# If the full text is requested, then append the decoded text to the original instruction.
# This technically isn't the full text, as we format the instruction in the prompt the model has been
# trained on, but to the client it will appear to be the full text.
if llm_config.return_full_text:
decoded = f"{DEFAULT_PROMPT_TEMPLATE.format(prompt)}\n{decoded}"
return [
{k: f"{DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"}
for i in decoded
for k, generated in i.items()
]

return decoded
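To illustrate the new return shape, a small example of the full-text prefixing when return_full_text=True (the template string here is shortened and is not Dolly's actual template):

PROMPT_TEMPLATE = "### Instruction:\n{instruction}\n### Response:\n"  # illustrative only

prompt = "Name three primary colors."
decoded = [{"generated_text": "Red, yellow and blue."}]

# with return_full_text=True the formatted instruction is prepended to every candidate
full = [
    {k: f"{PROMPT_TEMPLATE.format(instruction=prompt)}\n{generated}"}
    for item in decoded
    for k, generated in item.items()
]
# postprocess_generate then returns full[0]["generated_text"]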
@@ -32,8 +32,6 @@ class FalconConfig(
Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
"""

use_default_prompt_template: bool = False

class GenerationConfig:
max_new_tokens: int = 200
top_k: int = 10
@@ -88,17 +88,20 @@ class Falcon(openllm.LLM):
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
prompt_variables = {
k: v
for k, v in attrs.items()
if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
}
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument instead of "
"kwargs when 'use_default_prompt_template=True'"
)
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
)
else:
prompt_text = prompt
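The guard and KeyError handling above depend on knowing which placeholders the template declares. A minimal stand-in for default_formatter.extract_template_variables, assuming it behaves like standard string.Formatter parsing:

from string import Formatter

def extract_template_variables(template: str) -> set:
    # collect the named fields ({instruction}, {context}, ...) that the template expects
    return {field for _, field, _, _ in Formatter().parse(template) if field}

TEMPLATE = "{context}\nQuestion: {instruction}\nAnswer:"  # illustrative template
assert extract_template_variables(TEMPLATE) == {"context", "instruction"}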
@@ -53,17 +53,20 @@ class FlanT5(openllm.LLM):
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
prompt_variables = {
k: v
for k, v in attrs.items()
if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
}
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
)
else:
prompt_text = prompt
@@ -42,30 +42,38 @@ class FlaxFlanT5(openllm.LLM):
top_k: int | None = None,
top_p: float | None = None,
repetition_penalty: float | None = None,
decoder_start_token_id: int | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
if use_default_prompt_template:
prompt_variables = {
k: v
for k, v in attrs.items()
if k in default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
}
template_variables = default_formatter.extract_template_variables(DEFAULT_PROMPT_TEMPLATE)
prompt_variables = {k: v for k, v in attrs.items() if k in template_variables}
if "instruction" in prompt_variables:
raise RuntimeError(
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
try:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
except KeyError as e:
raise RuntimeError(
f"Missing variable '{e.args[0]}' (required: {template_variables}) in the prompt template. "
"Use 'use_default_prompt_template=False' to disable the default prompt template."
)
else:
prompt_text = prompt

if decoder_start_token_id is None:
decoder_start_token_id = 0

generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_k": top_k,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
"decoder_start_token_id": decoder_start_token_id,
}
return prompt_text, generation_config, {}
@@ -73,11 +81,15 @@ class FlaxFlanT5(openllm.LLM):
return generation_result[0]

def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
# XXX: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main
# as it is required for encoder-decoder generation.
decoder_start_token_id = attrs.pop("decoder_start_token_id", 0)
input_ids = self.tokenizer(prompt, return_tensors="np")["input_ids"]
result_tensor = self.model.generate(
input_ids,
do_sample=True,
generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
decoder_start_token_id=decoder_start_token_id,
)
return self.tokenizer.batch_decode(
result_tensor.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True
@@ -65,6 +65,4 @@ SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM will refuse to participate in anything that could harm a human.
""" # noqa

DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>""".format(
system_prompt=SYSTEM_PROMPT, instruction="{instruction}"
)
DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>"""
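With the eager .format(...) call removed, the template keeps both placeholders, so the system prompt is substituted per call and can be overridden per request. A small illustration with a shortened SYSTEM_PROMPT:

SYSTEM_PROMPT = "<|SYSTEM|># StableLM Tuned (Alpha version) ..."  # shortened for the example
DEFAULT_PROMPT_TEMPLATE = "{system_prompt}<|USER|>{instruction}<|ASSISTANT|>"

# callers may override system_prompt per request, falling back to the default otherwise
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(
    system_prompt=SYSTEM_PROMPT,
    instruction="Write a haiku about GPUs.",
)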
@@ -21,7 +21,7 @@ from transformers import StoppingCriteria, StoppingCriteriaList
import openllm

from ..._prompt import default_formatter
from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE
from .configuration_stablelm import DEFAULT_PROMPT_TEMPLATE, SYSTEM_PROMPT


class StopOnTokens(StoppingCriteria):
@@ -81,7 +81,8 @@ class StableLM(openllm.LLM):
"'instruction' should be passed as the first argument "
"instead of kwargs when 'use_default_prompt_template=True'"
)
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, **prompt_variables)
system_prompt = prompt_variables.pop("system_prompt", SYSTEM_PROMPT)
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt, system_prompt=system_prompt)
else:
prompt_text = prompt
@@ -20,8 +20,6 @@ import bentoml

import openllm

from .configuration_starcoder import DEFAULT_PROMPT_TEMPLATE

if t.TYPE_CHECKING:
import torch
import transformers
@@ -103,7 +101,6 @@ class StarCoder(openllm.LLM):
top_p: float | None = None,
max_new_tokens: int | None = None,
repetition_penalty: float | None = None,
use_default_prompt_template: bool = True,
**attrs: t.Any,
) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
fim_mode = FIM_INDICATOR in prompt
@@ -129,9 +126,6 @@ class StarCoder(openllm.LLM):
**attrs,
}

if use_default_prompt_template:
prompt_text = DEFAULT_PROMPT_TEMPLATE.format(instruction=prompt_text)

return prompt_text, generation_config, {}

def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
@@ -34,6 +34,7 @@ from bentoml._internal.types import LazyType as LazyType
from bentoml._internal.utils import LazyLoader as LazyLoader
from bentoml._internal.utils import bentoml_cattr as bentoml_cattr
from bentoml._internal.utils import copy_file_to_fs_folder as copy_file_to_fs_folder
from bentoml._internal.utils import first_not_none as first_not_none
from bentoml._internal.utils import pkg as pkg
from bentoml._internal.utils import reserve_free_port as reserve_free_port
from bentoml._internal.utils import resolve_user_filepath as resolve_user_filepath
@@ -161,7 +161,11 @@ class BaseAsyncClient(ClientMixin):
...

async def query(self, prompt: str, **attrs: t.Any) -> dict[str, t.Any] | str:
return_raw_response, prompt, generate_kwargs, postprocess_kwargs = self.prepare(prompt, **attrs)
# NOTE: We set use_default_prompt_template to False for now.
use_default_prompt_template = attrs.pop("use_default_prompt_template", False)
return_raw_response, prompt, generate_kwargs, postprocess_kwargs = self.prepare(
prompt, use_default_prompt_template=use_default_prompt_template, **attrs
)
inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs))
res = await self.acall("generate", inputs)
r = openllm.GenerationOutput(**res)
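From the client side, the default prompt template is now opt-in rather than applied implicitly. A hedged usage sketch; the concrete client class name and server address are assumptions, not taken from the diff:

import asyncio
import openllm

async def main() -> None:
    # AsyncHTTPClient is an assumed concrete subclass of BaseAsyncClient
    client = openllm.client.AsyncHTTPClient("http://localhost:3000")
    # the template now defaults to off; opt back in explicitly if desired
    result = await client.query(
        "What is the capital of France?",
        use_default_prompt_template=False,
        temperature=0.1,
    )
    print(result)

asyncio.run(main())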