From deaee67b47830a67340c5f91e22f84c48f484f50 Mon Sep 17 00:00:00 2001
From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Date: Wed, 9 Aug 2023 01:42:11 +0000
Subject: [PATCH] fix(loading): make sure to cast the model to cuda if PyTorch

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
---
 src/openllm/_llm.py                                | 11 +++++++++--
 src/openllm/models/gpt_neox/modeling_gpt_neox.py   |  1 -
 src/openllm/models/opt/modeling_opt.py             |  9 +--------
 src/openllm/serialisation/transformers/__init__.py |  2 --
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index 0908f06a..a3745178 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -861,10 +861,17 @@ class LLM(LLMInterface[M, T], ReprMixin):
   @property
   def model(self) -> M:
     # Run check for GPU
-    if DEBUG: traceback.print_stack()
     if self.config["requires_gpu"] and device_count() < 1: raise GpuNotAvailableError(f"{self} only supports running with GPU (None available).") from None
     # NOTE: the signature of load_model here is the wrapper under _wrapped_load_model
-    if self.__llm_model__ is None: self.__llm_model__ = self.load_model(*self._model_decls, **self._model_attrs)
+    if self.__llm_model__ is None:
+      model = self.load_model(*self._model_decls, **self._model_attrs)
+      # If OOM, then it is probably you don't have enough VRAM to run this model.
+      if self.__llm_implementation__ == "pt" and is_torch_available() and torch.cuda.is_available() and torch.cuda.device_count() == 1:
+        try:
+          model = model.to("cuda")
+        except Exception as err:
+          raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
+      self.__llm_model__ = model
     return self.__llm_model__
 
   @property
diff --git a/src/openllm/models/gpt_neox/modeling_gpt_neox.py b/src/openllm/models/gpt_neox/modeling_gpt_neox.py
index a1b90164..ab3a6474 100644
--- a/src/openllm/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/openllm/models/gpt_neox/modeling_gpt_neox.py
@@ -37,7 +37,6 @@ class GPTNeoX(openllm.LLM["transformers.GPTNeoXForCausalLM", "transformers.GPTNe
   def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.GPTNeoXForCausalLM:
     model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs)
     if self.config.use_half_precision: model.half()
-    if torch.cuda.is_available() and torch.cuda.device_count() == 1: model = model.to("cuda")
     return model
 
   def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
diff --git a/src/openllm/models/opt/modeling_opt.py b/src/openllm/models/opt/modeling_opt.py
index c1d544f6..ba5731c2 100644
--- a/src/openllm/models/opt/modeling_opt.py
+++ b/src/openllm/models/opt/modeling_opt.py
@@ -26,16 +26,9 @@ logger = logging.getLogger(__name__)
 class OPT(openllm.LLM["transformers.OPTForCausalLM", "transformers.GPT2Tokenizer"]):
   __openllm_internal__ = True
 
-  def llm_post_init(self):
-    self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
   @property
   def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
-    return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
-
-  def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.OPTForCausalLM:
-    torch_dtype = attrs.pop("torch_dtype", self.dtype)
-    return transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, torch_dtype=torch_dtype, **attrs)
+    return {"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32}, {}
 
   def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, num_return_sequences: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
     return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_k": top_k, "num_return_sequences": num_return_sequences}, {}
diff --git a/src/openllm/serialisation/transformers/__init__.py b/src/openllm/serialisation/transformers/__init__.py
index 87207ce0..3901726c 100644
--- a/src/openllm/serialisation/transformers/__init__.py
+++ b/src/openllm/serialisation/transformers/__init__.py
@@ -179,8 +179,6 @@ def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
 
   device_map = attrs.pop("device_map", "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
   model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.__llm_trust_remote_code__, device_map=device_map, **hub_attrs, **attrs).eval()
-  # If OOM, then it is probably you don't have enough VRAM to run this model.
-  if torch.cuda.is_available() and torch.cuda.device_count() == 1: model = model.cuda()
   # BetterTransformer is currently only supported on PyTorch.
   if llm.bettertransformer and isinstance(model, transformers.PreTrainedModel): model = model.to_bettertransformer()
   if llm.__llm_implementation__ in {"pt", "vllm"}: check_unintialised_params(model)