From 14d702a34f6f71bad3a6715cddb4f9a779b03d12 Mon Sep 17 00:00:00 2001
From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Date: Tue, 6 Jun 2023 22:21:40 +0000
Subject: [PATCH] fix(dolly-v2): using pipeline for latest implementation

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
---
 src/openllm/_configuration.py                 |   8 +-
 .../models/dolly_v2/configuration_dolly_v2.py |   3 +-
 .../models/dolly_v2/modeling_dolly_v2.py      | 159 +++++-------------
 3 files changed, 49 insertions(+), 121 deletions(-)

diff --git a/src/openllm/_configuration.py b/src/openllm/_configuration.py
index 7caebc28..c1278817 100644
--- a/src/openllm/_configuration.py
+++ b/src/openllm/_configuration.py
@@ -741,10 +741,15 @@ class LLMConfig:
 
         base_attrs, base_attr_map = _collect_base_attrs(cls, {a.name for a in own_attrs})
 
+        # __openllm_attrs__ is a tracking tuple of the names (str) of own_attrs
+        # that we construct ourselves.
+        cls.__openllm_attrs__ = tuple(a.name for a in own_attrs)
+
         # NOTE: Enable some default attributes that can be shared across all LLMConfig
         base_attrs = [
             attr.Attribute.from_counting_attr(k, cls.Field(default, env=field_env_key(k), description=docs), hints)
             for k, default, docs, hints in DEFAULT_LLMCONFIG_ATTRS
+            if k not in cls.__openllm_attrs__
         ] + base_attrs
         attrs: list[attr.Attribute[t.Any]] = own_attrs + base_attrs
 
@@ -776,9 +781,6 @@ class LLMConfig:
         _has_pre_init = bool(getattr(cls, "__attrs_pre_init__", False))
         _has_post_init = bool(getattr(cls, "__attrs_post_init__", False))
 
-        # __openllm_attrs__ is a tracking tuple[attr.Attribute[t.Any]]
-        # that we construct ourself.
-        cls.__openllm_attrs__ = tuple(a.name for a in attrs)
         AttrsTuple = _make_attr_tuple_class(cls.__name__, cls.__openllm_attrs__)
         # NOTE: generate a __attrs_init__ for the subclass
         cls.__attrs_init__ = _add_method_dunders(
diff --git a/src/openllm/models/dolly_v2/configuration_dolly_v2.py b/src/openllm/models/dolly_v2/configuration_dolly_v2.py
index b6a163ab..0850b9ef 100644
--- a/src/openllm/models/dolly_v2/configuration_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/configuration_dolly_v2.py
@@ -20,7 +20,7 @@ from __future__ import annotations
 import openllm
 
 
-class DollyV2Config(openllm.LLMConfig, default_timeout=3600000):
+class DollyV2Config(openllm.LLMConfig, default_timeout=3600000, trust_remote_code=True):
     """Databricks’ Dolly is an instruction-following large language model trained on the Databricks
     machine learning platform that is licensed for commercial use.
 
@@ -37,6 +37,7 @@ class DollyV2Config(openllm.LLMConfig, default_timeout=3600000):
     return_full_text: bool = openllm.LLMConfig.Field(
         False, description="Whether to return the full prompt to the users."
     )
+    use_default_prompt_template: bool = False
 
     class GenerationConfig:
         temperature: float = 0.9
diff --git a/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
index feb6e878..e242d5cc 100644
--- a/src/openllm/models/dolly_v2/modeling_dolly_v2.py
+++ b/src/openllm/models/dolly_v2/modeling_dolly_v2.py
@@ -13,13 +13,16 @@
 # limitations under the License.
 from __future__ import annotations
 
+import importlib
 import logging
-import re
 import typing as t
 
+import bentoml
+import transformers
+
 import openllm
 
-from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY
+from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
 
 if t.TYPE_CHECKING:
     import torch
@@ -58,16 +61,36 @@ class DollyV2(openllm.LLM):
 
     default_model = "databricks/dolly-v2-3b"
 
+    load_in_mha = False  # NOTE: disable bettertransformer for dolly
+
     variants = ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]
 
-    import_kwargs = {
-        "device_map": "auto",
-        "torch_dtype": torch.bfloat16,
-        "_tokenizer_padding_size": "left",
-    }
+    import_kwargs = {"device_map": "auto", "torch_dtype": torch.bfloat16, "_tokenizer_padding_side": "left"}
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+    def import_model(  # Build a transformers pipeline for `pretrained` and save it via bentoml under `tag`.
+        self, pretrained: str, tag: bentoml.Tag, *model_args: t.Any, tokenizer_kwds: dict[str, t.Any], **attrs: t.Any
+    ) -> bentoml.Model:
+        trust_remote_code = attrs.pop("trust_remote_code", True)  # caller override, else True
+        torch_dtype = attrs.pop("torch_dtype", torch.bfloat16)  # caller override, else bfloat16
+        device_map = attrs.pop("device_map", "auto")  # caller override, else "auto"
+
+        tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained, **tokenizer_kwds)
+        pipeline = transformers.pipeline(  # NOTE(review): no task is passed; confirm it is inferred from the model
+            model=pretrained,
+            tokenizer=tokenizer,
+            trust_remote_code=trust_remote_code,
+            torch_dtype=torch_dtype,
+            device_map=device_map,
+        )  # NOTE(review): *model_args and the remaining **attrs are not forwarded -- confirm intended
+        return bentoml.transformers.save_model(
+            tag,
+            pipeline,
+            custom_objects={"tokenizer": tokenizer},  # tokenizer is saved alongside the pipeline
+            external_modules=[importlib.import_module(pipeline.__module__)],  # module defining the pipeline class
+        )
+
     def sanitize_parameters(
         self,
         prompt: str,
@@ -75,7 +98,7 @@ class DollyV2(openllm.LLM):
         temperature: float | None = None,
         top_k: int | None = None,
         top_p: float | None = None,
-        use_default_prompt_template: bool = True,
+        use_default_prompt_template: bool = False,
         **attrs: t.Any,
     ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
         if use_default_prompt_template:
@@ -95,117 +118,19 @@ class DollyV2(openllm.LLM):
 
         return prompt_text, generate_kwargs, {}
 
-    def postprocess_generate(
-        self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any
-    ) -> str:
-        return generation_result[0]["generated_text"]
+    def postprocess_generate(self, prompt: str, generation_result: str, **_: t.Any) -> str:
+        return generation_result  # pass-through: the result is returned unchanged; `prompt` is unused
 
     @torch.inference_mode()
-    def generate(self, prompt: str, **attrs: t.Any):
-        """This is a implementation of InstructionTextGenerationPipeline from databricks."""
-        tokenizer_response_key = next(
-            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
-        )
-        response_key_token_id = None
-        end_key_token_id = None
-        eos_token_id = None
-
+    def generate(self, prompt: str, **attrs: t.Any) -> str:
+        self.model.tokenizer = self.tokenizer
         llm_config: openllm.DollyV2Config = self.config.model_construct_env(**attrs)
+        decoded = self.model(prompt, do_sample=True, generation_config=llm_config.to_generation_config())
 
-        if tokenizer_response_key:
-            try:
-                response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
-                end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)
+        # If the full text is requested, prepend the prompt, rendered through DEFAULT_PROMPT_TEMPLATE, to the
+        # decoded text so that the client sees what appears to be the full text.
+        # NOTE(review): format() gets `prompt` positionally -- confirm the template has no named placeholder.
+        if llm_config.return_full_text:
+            decoded = f"{DEFAULT_PROMPT_TEMPLATE.format(prompt)}\n{decoded}"
 
-                # Ensure generation stops once it generates "### End"
-                eos_token_id = end_key_token_id
-            except ValueError:
-                pass
-
-        inputs = self.tokenizer(prompt, return_tensors="pt")
-
-        input_ids = inputs["input_ids"]
-        attention_mask = inputs.get("attention_mask", None)
-
-        if input_ids.shape[1] == 0:
-            input_ids = None
-            attention_mask = None
-            in_b = 1
-        else:
-            in_b = input_ids.shape[0]
-
-        with torch.device(self.device):
-            generated_sequence = self.model.generate(
-                input_ids=input_ids.to(self.device),
-                attention_mask=attention_mask.to(self.device) if attention_mask is not None else None,
-                pad_token_id=self.tokenizer.pad_token_id,
-                do_sample=True,
-                eos_token_id=eos_token_id,
-                generation_config=llm_config.to_generation_config(),
-            )
-
-        out_b = generated_sequence.shape[0]
-
-        generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])[0]
-        if torch.cuda.is_available():
-            generated_sequence = generated_sequence.cpu()
-
-        generated_sequence: list[list[int]] = generated_sequence.numpy().tolist()
-        records: list[dict[str, t.Any]] = []
-        for sequence in generated_sequence:
-            # The response will be set to this variable if we can identify it.
-            decoded = None
-
-            # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
-            if response_key_token_id and end_key_token_id:
-                # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
-                # prompt, we should definitely find it.  We will return the tokens found after this token.
-                try:
-                    response_pos = sequence.index(response_key_token_id)
-                except ValueError:
-                    logger.warning(f"Could not find response key {response_key_token_id} in: {sequence}")
-                    response_pos = None
-
-                if response_pos:
-                    # Next find where "### End" is located.  The model has been trained to end its responses with this
-                    # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
-                    # this token, as the response could be truncated.  If we don't find it then just return everything
-                    # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
-                    try:
-                        end_pos = sequence.index(end_key_token_id)
-                    except ValueError:
-                        end_pos = None
-
-                    decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()
-
-            if not decoded:
-                # Otherwise we'll decode everything and use a regex to find the response and end.
-
-                fully_decoded = self.tokenizer.decode(sequence)
-
-                # The response appears after "### Response:".  The model has been trained to append "### End" at the
-                # end.
-                m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)
-
-                if m:
-                    decoded = m.group(1).strip()
-                else:
-                    # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
-                    # return everything after "### Response:".
-                    m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
-                    if m:
-                        decoded = m.group(1).strip()
-                    else:
-                        logger.warning(f"Failed to find response in:\n{fully_decoded}")
-
-            # If the full text is requested, then append the decoded text to the original instruction.
-            # This technically isn't the full text, as we format the instruction in the prompt the model has been
-            # trained on, but to the client it will appear to be the full text.
-            if llm_config.return_full_text:
-                decoded = f"{prompt}\n{decoded}"
-
-            rec = {"generated_text": decoded}
-
-            records.append(rec)
-
-        return records
+        return decoded