From 689b83bbe3bcff7da9235c0e75729e9e35ca1941 Mon Sep 17 00:00:00 2001 From: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> Date: Thu, 10 Aug 2023 19:39:01 +0000 Subject: [PATCH] fix(loading): make sure not to load to cuda with kbit quantisation Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com> --- compile.sh | 1 + src/openllm/_llm.py | 12 +++++++----- src/openllm/_schema.py | 15 ++++++++------- src/openllm/_service.py | 4 ++-- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/compile.sh b/compile.sh index f3a29d5a..7ee934bf 100644 --- a/compile.sh +++ b/compile.sh @@ -1,3 +1,4 @@ #!/usr/bin/env bash SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build -w -C--global-option=--verbose +hatch clean diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py index 30827b29..844c9f4c 100644 --- a/src/openllm/_llm.py +++ b/src/openllm/_llm.py @@ -840,11 +840,13 @@ class LLM(LLMInterface[M, T], ReprMixin): if self.__llm_model__ is None: model = self.load_model(*self._model_decls, **self._model_attrs) # If OOM, then it is probably you don't have enough VRAM to run this model. - if self.__llm_implementation__ == "pt" and is_torch_available() and torch.cuda.is_available() and torch.cuda.device_count() == 1: - try: - model = model.to("cuda") - except Exception as err: - raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err + if self.__llm_implementation__ == "pt" and is_torch_available(): + loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_quantized", False) + if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit: + try: + model = model.to("cuda") + except Exception as err: + raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err self.__llm_model__ = model return self.__llm_model__ diff --git a/src/openllm/_schema.py b/src/openllm/_schema.py index 854934c0..35e52183 100644 --- a/src/openllm/_schema.py +++ b/src/openllm/_schema.py @@ -19,6 +19,8 @@ import typing as t import attr import inflection +import openllm + from ._configuration import GenerationConfig from ._configuration import LLMConfig from .utils import bentoml_cattr @@ -30,19 +32,18 @@ if t.TYPE_CHECKING: class GenerationInput: prompt: str llm_config: LLMConfig - def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True)} + adapter_name: str | None = attr.field(default=None) + def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True), "adapter_name": self.adapter_name} @staticmethod def convert_llm_config(data: dict[str, t.Any] | LLMConfig, cls: type[LLMConfig] | None = None) -> LLMConfig: if isinstance(data, LLMConfig): return data - elif isinstance(data, dict): + else: if cls is None: raise ValueError("'cls' must pass if given data is a dictionary.") return cls(**data) - else: raise RuntimeError(f"Type {type(data)} is not yet supported.") @classmethod - def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: - from .models.auto import AutoConfig - llm_config = AutoConfig.for_model(model_name, **attrs) - return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__))}) + def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs)) + @classmethod + def from_llm_config(cls, llm_config: openllm.LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)}) @attr.frozen(slots=True) class GenerationOutput: diff --git a/src/openllm/_service.py b/src/openllm/_service.py index 84d8774e..529b019f 100644 --- a/src/openllm/_service.py +++ b/src/openllm/_service.py @@ -40,9 +40,9 @@ svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[r @svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), # type: ignore[arg-type] # XXX: remove once JSON supports Attrs class output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)})) async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput: - qa_inputs = openllm.GenerationInput.for_model(model)(**input_dict) + qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) config = qa_inputs.llm_config.model_dump() - responses = await runner.generate.async_run(qa_inputs.prompt, **config) + responses = await runner.generate.async_run(qa_inputs.prompt, **{"adapter_name": qa_inputs.adapter_name, **config}) return openllm.GenerationOutput(responses=responses, configuration=config) @svc.api(