From 204a7ab7c9ea4f8fa565d17de817c734f1ca239b Mon Sep 17 00:00:00 2001 From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> Date: Sat, 10 Jun 2023 23:17:42 +0000 Subject: [PATCH] revert(starcoder): quant 8 revert 2348946ada21a913593d14b7e5f41bbee8f6df89 Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> --- .../starcoder/configuration_starcoder.py | 6 ++-- .../models/starcoder/modeling_starcoder.py | 35 +++++++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/src/openllm/models/starcoder/configuration_starcoder.py b/src/openllm/models/starcoder/configuration_starcoder.py index 2dc9b307..e27264ed 100644 --- a/src/openllm/models/starcoder/configuration_starcoder.py +++ b/src/openllm/models/starcoder/configuration_starcoder.py @@ -34,11 +34,13 @@ class StarCoderConfig( """ class GenerationConfig: - temperature: float = 0.9 + temperature: float = 0.2 max_new_tokens: int = 256 + min_new_tokens: int = 32 + top_k: float = 50 top_p: float = 0.95 pad_token_id: int = 49152 - repetition_penalty: float = 1.0 + repetition_penalty: float = 1.2 START_STARCODER_COMMAND_DOCSTRING = """\ diff --git a/src/openllm/models/starcoder/modeling_starcoder.py b/src/openllm/models/starcoder/modeling_starcoder.py index 00f4c639..7959ec19 100644 --- a/src/openllm/models/starcoder/modeling_starcoder.py +++ b/src/openllm/models/starcoder/modeling_starcoder.py @@ -44,7 +44,14 @@ class StarCoder(openllm.LLM): model_ids = ["bigcode/starcoder", "bigcode/starcoderbase"] - device = torch.device("cuda") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + import_kwargs = { + "_tokenizer_padding_side": "left", + "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, + "load_in_8bit": True if torch.cuda.device_count() > 1 else False, + "torch_dtype": torch.float16, + } def import_model( self, @@ -54,6 +61,10 @@ class StarCoder(openllm.LLM): tokenizer_kwds: dict[str, t.Any], **attrs: t.Any, ) -> bentoml.Model: + torch_dtype = attrs.pop("torch_dtype", torch.float16) + load_in_8bit = attrs.pop("load_in_8bit", True) + device_map = attrs.pop("device_map", "auto") + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **tokenizer_kwds) tokenizer.add_special_tokens( { @@ -62,8 +73,9 @@ class StarCoder(openllm.LLM): } ) - model = transformers.AutoModelForCausalLM.from_pretrained(model_id, **attrs) - + model = transformers.AutoModelForCausalLM.from_pretrained( + model_id, torch_dtype=torch_dtype, load_in_8bit=load_in_8bit, device_map=device_map, **attrs + ) try: return bentoml.transformers.save_model(tag, model, custom_objects={"tokenizer": tokenizer}) finally: @@ -113,12 +125,15 @@ class StarCoder(openllm.LLM): @torch.inference_mode() def generate(self, prompt: str, **attrs: t.Any) -> list[str]: inputs = t.cast("torch.Tensor", self.tokenizer.encode(prompt, return_tensors="pt")).to(self.device) - with torch.device(self.device): - result_tensor = self.model.generate( - inputs, - do_sample=True, - generation_config=self.config.model_construct_env(**attrs).to_generation_config(), - ) + result_tensor = self.model.generate( + inputs, + do_sample=True, + pad_token_id=self.tokenizer.eos_token_id, + # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder + generation_config=self.config.model_construct_env(**attrs).to_generation_config(), + ) # TODO: We will probably want to return the tokenizer here so that we can manually process this # return (skip_special_tokens=False, clean_up_tokenization_spaces=False)) - return [self.tokenizer.decode(result_tensor[0])] + return self.tokenizer.batch_decode( + result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True + )