fix(loading): make sure not to load to CUDA with k-bit quantization

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
aarnphm-ec2-dev
2023-08-10 19:39:01 +00:00
parent 7c3646bb89
commit 689b83bbe3
4 changed files with 18 additions and 14 deletions

View File

@@ -1,3 +1,4 @@
#!/usr/bin/env bash
# Build a wheel for this project with hatch build hooks enabled, then clean up
# build artifacts.

# Absolute directory containing this script, independent of the caller's CWD.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
# HATCH_VERBOSE=3           : maximum hatch verbosity.
# HATCH_BUILD_HOOKS_ENABLE=1: run the project's custom build hooks during the PEP 517 build.
# MYPY_CONFIG_FILE_DIR / MYPYPATH: point mypy at this repo's config and the bundled
#   type stubs in ./typings — presumably consumed by a mypy-based build hook; confirm.
# python -m build -w        : build a wheel only (no sdist);
#   -C--global-option=--verbose forwards --verbose to the build backend.
HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build -w -C--global-option=--verbose
# Remove build artifacts (dist/, intermediate build dirs) after the wheel is built.
hatch clean

View File

@@ -840,11 +840,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
if self.__llm_model__ is None:
model = self.load_model(*self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
if self.__llm_implementation__ == "pt" and is_torch_available() and torch.cuda.is_available() and torch.cuda.device_count() == 1:
try:
model = model.to("cuda")
except Exception as err:
raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
if self.__llm_implementation__ == "pt" and is_torch_available():
loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_quantized", False)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
model = model.to("cuda")
except Exception as err:
raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
self.__llm_model__ = model
return self.__llm_model__

View File

@@ -19,6 +19,8 @@ import typing as t
import attr
import inflection
import openllm
from ._configuration import GenerationConfig
from ._configuration import LLMConfig
from .utils import bentoml_cattr
@@ -30,19 +32,18 @@ if t.TYPE_CHECKING:
class GenerationInput:
prompt: str
llm_config: LLMConfig
def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True)}
adapter_name: str | None = attr.field(default=None)
def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True), "adapter_name": self.adapter_name}
@staticmethod
def convert_llm_config(data: dict[str, t.Any] | LLMConfig, cls: type[LLMConfig] | None = None) -> LLMConfig:
if isinstance(data, LLMConfig): return data
elif isinstance(data, dict):
else:
if cls is None: raise ValueError("'cls' must pass if given data is a dictionary.")
return cls(**data)
else: raise RuntimeError(f"Type {type(data)} is not yet supported.")
@classmethod
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
from .models.auto import AutoConfig
llm_config = AutoConfig.for_model(model_name, **attrs)
return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__))})
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs))
@classmethod
def from_llm_config(cls, llm_config: openllm.LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)})
@attr.frozen(slots=True)
class GenerationOutput:

View File

@@ -40,9 +40,9 @@ svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[r
@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), # type: ignore[arg-type] # XXX: remove once JSON supports Attrs class
output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
qa_inputs = openllm.GenerationInput.for_model(model)(**input_dict)
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
config = qa_inputs.llm_config.model_dump()
responses = await runner.generate.async_run(qa_inputs.prompt, **config)
responses = await runner.generate.async_run(qa_inputs.prompt, **{"adapter_name": qa_inputs.adapter_name, **config})
return openllm.GenerationOutput(responses=responses, configuration=config)
@svc.api(