From 689b83bbe3bcff7da9235c0e75729e9e35ca1941 Mon Sep 17 00:00:00 2001
From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Date: Thu, 10 Aug 2023 19:39:01 +0000
Subject: [PATCH] fix(loading): do not move k-bit quantised models to CUDA

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
---
 compile.sh              |  1 +
 src/openllm/_llm.py     | 12 +++++++-----
 src/openllm/_schema.py  | 15 ++++++++-------
 src/openllm/_service.py |  4 ++--
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/compile.sh b/compile.sh
index f3a29d5a..7ee934bf 100644
--- a/compile.sh
+++ b/compile.sh
@@ -1,3 +1,4 @@
 #!/usr/bin/env bash
 SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
 HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build -w -C--global-option=--verbose
+hatch clean
diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index 30827b29..844c9f4c 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -840,11 +840,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
     if self.__llm_model__ is None:
       model = self.load_model(*self._model_decls, **self._model_attrs)
       # If OOM, then it is probably you don't have enough VRAM to run this model.
-      if self.__llm_implementation__ == "pt" and is_torch_available() and torch.cuda.is_available() and torch.cuda.device_count() == 1:
-        try:
-          model = model.to("cuda")
-        except Exception as err:
-          raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
+      if self.__llm_implementation__ == "pt" and is_torch_available():
+        loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_quantized", False)
+        if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
+          try:
+            model = model.to("cuda")
+          except Exception as err:
+            raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
       self.__llm_model__ = model
     return self.__llm_model__
 
diff --git a/src/openllm/_schema.py b/src/openllm/_schema.py
index 854934c0..35e52183 100644
--- a/src/openllm/_schema.py
+++ b/src/openllm/_schema.py
@@ -19,6 +19,8 @@ import typing as t
 import attr
 import inflection
 
+import openllm
+
 from ._configuration import GenerationConfig
 from ._configuration import LLMConfig
 from .utils import bentoml_cattr
@@ -30,19 +32,18 @@ if t.TYPE_CHECKING:
 class GenerationInput:
   prompt: str
   llm_config: LLMConfig
-  def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True)}
+  adapter_name: str | None = attr.field(default=None)
+  def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True), "adapter_name": self.adapter_name}
   @staticmethod
   def convert_llm_config(data: dict[str, t.Any] | LLMConfig, cls: type[LLMConfig] | None = None) -> LLMConfig:
     if isinstance(data, LLMConfig): return data
-    elif isinstance(data, dict):
+    else:
       if cls is None: raise ValueError("'cls' must pass if given data is a dictionary.")
       return cls(**data)
-    else: raise RuntimeError(f"Type {type(data)} is not yet supported.")
   @classmethod
-  def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
-    from .models.auto import AutoConfig
-    llm_config = AutoConfig.for_model(model_name, **attrs)
-    return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__))})
+  def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs))
+  @classmethod
+  def from_llm_config(cls, llm_config: openllm.LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)})
 
 @attr.frozen(slots=True)
 class GenerationOutput:
diff --git a/src/openllm/_service.py b/src/openllm/_service.py
index 84d8774e..529b019f 100644
--- a/src/openllm/_service.py
+++ b/src/openllm/_service.py
@@ -40,9 +40,9 @@ svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[r
 @svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}),  # type: ignore[arg-type] # XXX: remove once JSON supports Attrs class
           output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
 async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
-  qa_inputs = openllm.GenerationInput.for_model(model)(**input_dict)
+  qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
   config = qa_inputs.llm_config.model_dump()
-  responses = await runner.generate.async_run(qa_inputs.prompt, **config)
+  responses = await runner.generate.async_run(qa_inputs.prompt, **{"adapter_name": qa_inputs.adapter_name, **config})
   return openllm.GenerationOutput(responses=responses, configuration=config)
 
 @svc.api(