From 204a7ab7c9ea4f8fa565d17de817c734f1ca239b Mon Sep 17 00:00:00 2001
From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Date: Sat, 10 Jun 2023 23:17:42 +0000
Subject: [PATCH] revert(starcoder): quant 8

revert 2348946ada21a913593d14b7e5f41bbee8f6df89

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
---
 .../starcoder/configuration_starcoder.py      |  6 ++--
 .../models/starcoder/modeling_starcoder.py    | 35 +++++++++++++------
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/openllm/models/starcoder/configuration_starcoder.py b/src/openllm/models/starcoder/configuration_starcoder.py
index 2dc9b307..e27264ed 100644
--- a/src/openllm/models/starcoder/configuration_starcoder.py
+++ b/src/openllm/models/starcoder/configuration_starcoder.py
@@ -34,11 +34,13 @@ class StarCoderConfig(
     """
 
     class GenerationConfig:
-        temperature: float = 0.9
+        temperature: float = 0.2
         max_new_tokens: int = 256
+        min_new_tokens: int = 32
+        top_k: int = 50  # top-k is a token count, so it is annotated as an integer
         top_p: float = 0.95
         pad_token_id: int = 49152
-        repetition_penalty: float = 1.0
+        repetition_penalty: float = 1.2
 
 
 START_STARCODER_COMMAND_DOCSTRING = """\
diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py
index 00f4c639..7959ec19 100644
--- a/src/openllm/models/starcoder/modeling_starcoder.py
+++ b/src/openllm/models/starcoder/modeling_starcoder.py
@@ -44,7 +44,14 @@ class StarCoder(openllm.LLM):
 
     model_ids = ["bigcode/starcoder", "bigcode/starcoderbase"]
 
-    device = torch.device("cuda")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    import_kwargs = {
+        "_tokenizer_padding_side": "left",
+        "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
+        "load_in_8bit": torch.cuda.device_count() > 1,  # the comparison is already a bool; True only with several GPUs
+        "torch_dtype": torch.float16,
+    }
 
     def import_model(
         self,
@@ -54,6 +61,10 @@ class StarCoder(openllm.LLM):
         tokenizer_kwds: dict[str, t.Any],
         **attrs: t.Any,
     ) -> bentoml.Model:
+        torch_dtype = attrs.pop("torch_dtype", torch.float16)
+        load_in_8bit = attrs.pop("load_in_8bit", True)
+        device_map = attrs.pop("device_map", "auto")
+
         tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds)
         tokenizer.add_special_tokens(
             {
@@ -62,8 +73,9 @@ class StarCoder(openllm.LLM):
             }
         )
 
-        model = transformers.AutoModelForCausalLM.from_pretrained(model_id, **attrs)
-
+        model = transformers.AutoModelForCausalLM.from_pretrained(
+            model_id, torch_dtype=torch_dtype, load_in_8bit=load_in_8bit, device_map=device_map, **attrs
+        )
         try:
             return bentoml.transformers.save_model(tag, model, custom_objects={"tokenizer": tokenizer})
         finally:
@@ -113,12 +125,15 @@ class StarCoder(openllm.LLM):
     @torch.inference_mode()
     def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
         inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device)
-        with torch.device(self.device):
-            result_tensor = self.model.generate(
-                inputs,
-                do_sample=True,
-                generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
-            )
+        result_tensor = self.model.generate(  # inputs were already moved to self.device above
+            inputs,
+            do_sample=True,
+            pad_token_id=self.tokenizer.eos_token_id,  # pad with the tokenizer's EOS id during generation
+            # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder
+            generation_config=self.config.model_construct_env(**attrs).to_generation_config(),
+        )
         # TODO: We will probably want to return the tokenizer here so that we can manually process this
         # return (skip_special_tokens=False, clean_up_tokenization_spaces=False))
-        return [self.tokenizer.decode(result_tensor[0])]
+        return self.tokenizer.batch_decode(  # decode whole sequences: one string per row, not one per token of row 0
+            result_tensor, skip_special_tokens=True, clean_up_tokenization_spaces=True
+        )