mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-23 00:17:28 -04:00
fix(loading): make sure not to load to cuda with kbit quantisation
Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
|
||||
HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" python -m build -w -C--global-option=--verbose
|
||||
hatch clean
|
||||
|
||||
@@ -840,11 +840,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
|
||||
if self.__llm_model__ is None:
|
||||
model = self.load_model(*self._model_decls, **self._model_attrs)
|
||||
# If this raises an OOM error, you probably don't have enough VRAM to run this model.
|
||||
if self.__llm_implementation__ == "pt" and is_torch_available() and torch.cuda.is_available() and torch.cuda.device_count() == 1:
|
||||
try:
|
||||
model = model.to("cuda")
|
||||
except Exception as err:
|
||||
raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
|
||||
if self.__llm_implementation__ == "pt" and is_torch_available():
|
||||
loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_quantized", False)
|
||||
if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
|
||||
try:
|
||||
model = model.to("cuda")
|
||||
except Exception as err:
|
||||
raise OpenLLMException(f"Failed to load {self} into GPU: {err}\nTip: If you run into OOM issue, maybe try different offload strategy. See https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/quantization#offload-between-cpu-and-gpu for more information.") from err
|
||||
self.__llm_model__ = model
|
||||
return self.__llm_model__
|
||||
|
||||
|
||||
@@ -19,6 +19,8 @@ import typing as t
|
||||
import attr
|
||||
import inflection
|
||||
|
||||
import openllm
|
||||
|
||||
from ._configuration import GenerationConfig
|
||||
from ._configuration import LLMConfig
|
||||
from .utils import bentoml_cattr
|
||||
@@ -30,19 +32,18 @@ if t.TYPE_CHECKING:
|
||||
class GenerationInput:
|
||||
prompt: str
|
||||
llm_config: LLMConfig
|
||||
def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True)}
|
||||
adapter_name: str | None = attr.field(default=None)
|
||||
def model_dump(self) -> dict[str, t.Any]: return {"prompt": self.prompt, "llm_config": self.llm_config.model_dump(flatten=True), "adapter_name": self.adapter_name}
|
||||
@staticmethod
|
||||
def convert_llm_config(data: dict[str, t.Any] | LLMConfig, cls: type[LLMConfig] | None = None) -> LLMConfig:
|
||||
if isinstance(data, LLMConfig): return data
|
||||
elif isinstance(data, dict):
|
||||
else:
|
||||
if cls is None: raise ValueError("'cls' must pass if given data is a dictionary.")
|
||||
return cls(**data)
|
||||
else: raise RuntimeError(f"Type {type(data)} is not yet supported.")
|
||||
@classmethod
|
||||
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
|
||||
from .models.auto import AutoConfig
|
||||
llm_config = AutoConfig.for_model(model_name, **attrs)
|
||||
return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__))})
|
||||
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs))
|
||||
@classmethod
|
||||
def from_llm_config(cls, llm_config: openllm.LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)})
|
||||
|
||||
@attr.frozen(slots=True)
|
||||
class GenerationOutput:
|
||||
|
||||
@@ -40,9 +40,9 @@ svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[r
|
||||
@svc.api(route="/v1/generate", input=bentoml.io.JSON.from_sample({"prompt": "", "llm_config": llm_config.model_dump(flatten=True)}), # type: ignore[arg-type] # XXX: remove once JSON supports Attrs class
|
||||
output=bentoml.io.JSON.from_sample({"responses": [], "configuration": llm_config.model_dump(flatten=True)}))
|
||||
async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput:
|
||||
qa_inputs = openllm.GenerationInput.for_model(model)(**input_dict)
|
||||
qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
|
||||
config = qa_inputs.llm_config.model_dump()
|
||||
responses = await runner.generate.async_run(qa_inputs.prompt, **config)
|
||||
responses = await runner.generate.async_run(qa_inputs.prompt, **{"adapter_name": qa_inputs.adapter_name, **config})
|
||||
return openllm.GenerationOutput(responses=responses, configuration=config)
|
||||
|
||||
@svc.api(
|
||||
|
||||
Reference in New Issue
Block a user