fix(loading): make sure not to load to CUDA with k-bit quantization

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
aarnphm-ec2-dev
2023-08-10 19:39:01 +00:00
parent 7c3646bb89
commit 689b83bbe3
4 changed files with 18 additions and 14 deletions

View File

@@ -1,3 +1,4 @@
#!/usr/bin/env bash
# Build a wheel for this project with hatch build hooks enabled, then clean up
# build artifacts.

# Absolute directory containing this script, independent of the caller's CWD.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
# HATCH_VERBOSE=3           : maximum hatch verbosity.
# HATCH_BUILD_HOOKS_ENABLE=1: run the project's custom build hooks during the PEP 517 build.
# MYPY_CONFIG_FILE_DIR / MYPYPATH: point mypy at this repo's config and the bundled
#   type stubs in ./typings — presumably consumed by a mypy-based build hook; confirm.
# python -m build -w        : build a wheel only (no sdist);
#   -C--global-option=--verbose forwards --verbose to the build backend.
HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build -w -C--global-option=--verbose
# Remove build artifacts (dist/, intermediate build dirs) after the wheel is built.
hatch clean

View File

@@ -840,11 +840,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
if self.__llm_model__ is None:
model = self.load_model(*self._model_decls, **self._model_attrs)
# If OOM, then it is probably you don't have enough VRAM to run this model.
if self.__llm_implementation__ == "pt" and is_torch_available() and torch.cuda.is_available() and torch.cuda.device_count() == 1:
try:
model = model.to("cuda")
except Exception as err:
raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
if self.__llm_implementation__ == "pt" and is_torch_available():
loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_quantized", False)
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
try:
model = model.to("cuda")
except Exception as err:
raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
self.__llm_model__ = model
return self.__llm_model__

View File

@@ -19,6 +19,8 @@ import typing as t
import attr
import inflection
import openllm
from ._configuration import GenerationConfig
from ._configuration import LLMConfig
from .utils import bentoml_cattr
@@ -30,19 +32,18 @@ if t.TYPE_CHECKING:
class GenerationInput:
prompt: str
llm_config: LLMConfig
def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True)}
adapter_name: str | None = attr.field(default=None)
def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True), "adapter_name": self.adapter_name}
@staticmethod
def convert_llm_config(data: dict[str, t.Any] | LLMConfig, cls: type[LLMConfig] | None = None) -> LLMConfig:
if isinstance(data, LLMConfig): return data
elif isinstance(data, dict):
else:
if cls is None: raise ValueError("'cls' must pass if given data is a dictionary.")
return cls(**data)
else: raise RuntimeError(f"Type {type(data)} is not yet supported.")
@classmethod
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
from .models.auto import AutoConfig
llm_config = AutoConfig.for_model(model_name, **attrs)
return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__))})
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs))
@classmethod
def from_llm_config(cls, llm_config: openllm.LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)})
@attr.frozen(slots=True)
class GenerationOutput:

View File

@@ -40,9 +40,9 @@ svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[r
@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), # type: ignore[arg-type] # XXX: remove once JSON supports Attrs class
output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
qa_inputs = openllm.GenerationInput.for_model(model)(**input_dict)
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
config = qa_inputs.llm_config.model_dump()
responses = await runner.generate.async_run(qa_inputs.prompt, **config)
responses = await runner.generate.async_run(qa_inputs.prompt, **{"adapter_name": qa_inputs.adapter_name, **config})
return openllm.GenerationOutput(responses=responses, configuration=config)
@svc.api(