fix(gptq): use upstream integration (#297)

* wip

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* feat: GPTQ transformers integration

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* fix: only load if variable is available and add changelog

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

* chore: remove boilerplate check

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: aarnphm-ec2-dev <29749331+aarnphm@users.noreply.github.com>
Aaron Pham
2023-09-04 14:05:50 -04:00
committed by GitHub
parent 3da869e728
commit 956b3a53bc
23 changed files with 197 additions and 248 deletions

View File

@@ -39,10 +39,11 @@ classifiers = [
]
dependencies = [
"bentoml[io]>=1.1.2",
"transformers[torch,tokenizers,accelerate]>=4.29.0",
"transformers[torch,tokenizers,accelerate]>=4.32.1",
"openllm-client",
"safetensors",
"optimum",
"optimum>=1.12.0",
"accelerate",
"ghapi",
"tabulate[widechars]>=0.9.0",
"click>=8.1.3",
@@ -99,13 +100,13 @@ all = ["openllm[full]"]
baichuan = ["cpm-kernels", "sentencepiece"]
chatglm = ["cpm-kernels", "sentencepiece"]
falcon = ["einops", "xformers"]
fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
fine-tune = ["peft>=0.5.0", "bitsandbytes", "datasets", "accelerate", "trl"]
flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
full = [
"openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]",
"openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,grpc,llama,mpt,openai,opt,playground,starcoder,vllm]",
]
ggml = ["ctransformers"]
gptq = ["auto-gptq[triton]"]
gptq = ["auto-gptq[triton]>=0.4.2", "optimum>=1.12.0"]
grpc = ["openllm-client[grpc]"]
llama = ["fairscale", "sentencepiece"]
mpt = ["triton", "einops"]
@@ -150,7 +151,7 @@ dependencies = [
# avoid https://github.com/pallets/click/issues/2558
"click==8.1.3",
"bentoml==1.1.2",
"transformers>=4.31.0",
"transformers>=4.32.1",
"pandas-stubs",
"types-psutil",
"types-tabulate",

View File

@@ -28,6 +28,7 @@ from openllm_core._typing_compat import AdaptersTuple
from openllm_core._typing_compat import AdapterType
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import LiteralBackend
from openllm_core._typing_compat import LiteralQuantise
from openllm_core._typing_compat import LiteralString
from openllm_core._typing_compat import LLMRunnable
from openllm_core._typing_compat import LLMRunner
@@ -63,7 +64,6 @@ from .utils import infer_auto_class
if t.TYPE_CHECKING:
import auto_gptq as autogptq
import peft
import torch
import transformers
@@ -71,7 +71,6 @@ if t.TYPE_CHECKING:
from openllm_core._configuration import PeftType
from openllm_core.utils.representation import ReprArgs
else:
autogptq = LazyLoader('autogptq', globals(), 'auto_gptq')
transformers = LazyLoader('transformers', globals(), 'transformers')
torch = LazyLoader('torch', globals(), 'torch')
peft = LazyLoader('peft', globals(), 'peft')
@@ -80,6 +79,8 @@ ResolvedAdaptersMapping = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConf
logger = logging.getLogger(__name__)
_object_setattr = object.__setattr__
def normalise_model_name(name: str) -> str:
if validate_is_path(name): return os.path.basename(resolve_filepath(name))
name = name.replace('/', '--')
@@ -280,7 +281,8 @@ class LLM(LLMInterface[M, T], ReprMixin):
def __attrs_init__(self,
config: LLMConfig,
quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]],
quantize: t.Optional[LiteralQuantise],
quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig]],
model_id: str,
model_decls: TupleAny,
model_attrs: DictStrAny,
@@ -288,17 +290,16 @@ class LLM(LLMInterface[M, T], ReprMixin):
tag: bentoml.Tag,
adapters_mapping: t.Optional[AdaptersMapping],
model_version: t.Optional[str],
quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']],
serialisation_format: t.Literal['safetensors', 'legacy'],
serialisation: t.Literal['safetensors', 'legacy'],
_local: bool,
**attrs: t.Any) -> None:
'''Generated __attrs_init__ for openllm.LLM.'''
config: LLMConfig
'''The config instance to use for this LLM. This will be created based on config_class and available when initialising the LLM.'''
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None
quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None
'''Quantisation config for quantised model on the fly.'''
_quantize: LiteralQuantise | None
_model_id: str
_model_decls: TupleAny
_model_attrs: DictStrAny
@@ -306,8 +307,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
_tag: bentoml.Tag
_adapters_mapping: AdaptersMapping | None
_model_version: str
_quantize_method: t.Literal['int8', 'int4', 'gptq'] | None
_serialisation_format: t.Literal['safetensors', 'legacy']
_serialisation: t.Literal['safetensors', 'legacy']
_local: bool
def __init_subclass__(cls: type[LLM[M, T]]) -> None:
@@ -376,11 +376,11 @@ class LLM(LLMInterface[M, T], ReprMixin):
model_version: str | None = None,
llm_config: LLMConfig | None = None,
*args: t.Any,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
quantize: LiteralQuantise | None = None,
adapter_id: str | None = None,
adapter_name: str | None = None,
adapter_map: dict[str, str | None] | None = None,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None,
serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
**attrs: t.Any) -> LLM[M, T]:
'''Instantiate a pretrained LLM.
@@ -403,9 +403,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
model = openllm.AutoLLM.from_pretrained("opt", quantize='int8', llm_int8_enable_fp32_cpu_offload=False)
```
For all GPTQ-related options, it accepts all values prefixed with `gptq_*`. The parsed values can then be passed
to ``auto_gptq.BaseQuantizeConfig``.
### Adapter options:
> This is used in conjunction with the fine-tuning features
@@ -427,7 +424,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
will use `config_class` to construct default configuration.
quantize: The quantization to use for this LLM. Defaults to None. Possible values
include int8, int4 and gptq.
quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `autogptq.BaseQuantizeConfig`) to use. Note that this is mutually exclusive with `quantize`
quantization_config: The quantization config (`transformers.BitsAndBytesConfig` | `transformers.GPTQConfig`) to use. Note that this is mutually exclusive with `quantize`
serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
Default behaviour is similar to ``safe_serialization=False``.
adapter_id: The [LoRA](https://arxiv.org/pdf/2106.09685.pdf) pretrained id or local path to use for this LLM. Defaults to None.
@@ -440,13 +437,15 @@ class LLM(LLMInterface[M, T], ReprMixin):
_local = False
_model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__)
if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True
quantize = first_not_none(quantize, t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None)
quantize = first_not_none(quantize, t.cast(t.Optional[LiteralQuantise], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None)
# quantization setup
if quantization_config and quantize:
raise ValueError("'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.")
if quantization_config is None and quantize is not None:
quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs)
# in case users input `tokenizer` to __init__, default to the _model_id
_gptq_tokenizer = attrs.pop('tokenizer', _model_id)
quantization_config, attrs = infer_quantisation_config(cls, quantize, tokenizer=_gptq_tokenizer, **attrs)
if quantize == 'gptq': serialisation = 'safetensors'
elif cls.__llm_backend__ == 'vllm': serialisation = 'legacy' # Currently a work in progress
@@ -476,10 +475,10 @@ class LLM(LLMInterface[M, T], ReprMixin):
model_id=_model_id,
llm_config=llm_config,
quantization_config=quantization_config,
_quantize_method=quantize,
_quantize=quantize,
_model_version=_tag.version,
_tag=_tag,
_serialisation_format=serialisation,
_serialisation=serialisation,
_local=_local,
_adapters_mapping=resolve_peft_config_type(adapter_map) if adapter_map is not None else None,
**attrs)
@@ -534,12 +533,12 @@ class LLM(LLMInterface[M, T], ReprMixin):
*args: t.Any,
model_id: str,
llm_config: LLMConfig,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None,
quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None,
_adapters_mapping: AdaptersMapping | None,
_tag: bentoml.Tag,
_quantize_method: t.Literal['int8', 'int4', 'gptq'] | None,
_quantize: LiteralQuantise | None,
_model_version: str,
_serialisation_format: t.Literal['safetensors', 'legacy'],
_serialisation: t.Literal['safetensors', 'legacy'],
_local: bool,
**attrs: t.Any,
):
@@ -641,6 +640,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
# NOTE: Save the args and kwargs for latter load
self.__attrs_init__(llm_config,
quantization_config,
_quantize,
model_id,
args, {
**model_kwds, **normalized_model_kwds
@@ -650,8 +650,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
_tag,
_adapters_mapping,
_model_version,
_quantize_method,
_serialisation_format,
_serialisation,
_local)
self.llm_post_init()
@@ -672,7 +671,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
@adapters_mapping.setter
def adapters_mapping(self, value: AdaptersMapping) -> None:
self._adapters_mapping = value
_object_setattr(self, '_adapters_mapping', value)
@property
def __repr_keys__(self) -> set[str]:
@@ -709,13 +708,13 @@ class LLM(LLMInterface[M, T], ReprMixin):
def tag(self) -> bentoml.Tag:
return self._tag
def ensure_model_id_exists(self) -> bentoml.Model:
def save_pretrained(self) -> bentoml.Model:
return openllm.import_model(self.config['start_name'],
model_id=self.model_id,
model_version=self._model_version,
backend=self.__llm_backend__,
quantize=self._quantize_method,
serialisation_format=self._serialisation_format)
quantize=self._quantize,
serialisation=self._serialisation)
@property
def _bentomodel(self) -> bentoml.Model:
@@ -1085,11 +1084,11 @@ def Runner(model_name: str,
model_id: str | None = ...,
model_version: str | None = ...,
llm_config: LLMConfig | None = ...,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = ...,
quantize: LiteralQuantise | None = ...,
adapter_id: str | None = ...,
adapter_name: str | None = ...,
adapter_map: dict[str, str | None] | None = ...,
quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None,
quantization_config: transformers.BitsAndBytesConfig | transformers.GPTQConfig | None = None,
serialisation: t.Literal['safetensors', 'legacy'] = ...,
**attrs: t.Any) -> LLMRunner[t.Any, t.Any]:
...
@@ -1270,7 +1269,7 @@ def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]:
'config': self.config,
'backend': self.__llm_backend__,
'peft_adapters': property(fget=available_adapters),
'download_model': self.ensure_model_id_exists,
'download_model': self.save_pretrained,
'__call__': _wrapped_generate_run,
'embed': _wrapped_embeddings_run,
'__module__': self.__module__,
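Taken together, a hedged sketch of the renamed surface in this file (the model id is illustrative):

```python
import openllm

# 'quantize' is now typed as LiteralQuantise; 'gptq' yields a
# transformers.GPTQConfig instead of an auto_gptq.BaseQuantizeConfig.
llm = openllm.AutoLLM.from_pretrained('opt', model_id='facebook/opt-125m', quantize='gptq')

# ensure_model_id_exists() is renamed to save_pretrained(); it still imports
# the weights into the local BentoML store and returns a bentoml.Model.
bentomodel = llm.save_pretrained()
print(bentomodel.tag)
```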

View File

@@ -3,47 +3,68 @@ from __future__ import annotations
import logging
import typing as t
import torch
import transformers
from openllm_core._typing_compat import LiteralQuantise
from openllm_core._typing_compat import overload
from openllm_core.utils import LazyLoader
from openllm_core.utils import is_autogptq_available
from openllm_core.utils import is_bitsandbytes_available
from openllm_core.utils import is_transformers_supports_kbit
from openllm_core.utils import pkg
from openllm_core.utils import is_optimum_supports_gptq
if t.TYPE_CHECKING:
from openllm_core._typing_compat import DictStrAny
from ._llm import LLM
autogptq, torch, transformers = LazyLoader('autogptq', globals(), 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers')
logger = logging.getLogger(__name__)
QuantiseMode = t.Literal['int8', 'int4', 'gptq']
@overload
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['int8', 'int4'], **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig, DictStrAny]:
...
@overload
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal['gptq'], **attrs: t.Any) -> tuple[transformers.GPTQConfig, DictStrAny]:
...
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]:
def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: LiteralQuantise, **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | transformers.GPTQConfig, DictStrAny]:
# 8 bit configuration
int8_threshold = attrs.pop('llm_int8_threshhold', 6.0)
int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False)
int8_skip_modules: list[str] | None = attrs.pop('llm_int8_skip_modules', None)
int8_has_fp16_weight = attrs.pop('llm_int8_has_fp16_weight', False)
autogptq_attrs: DictStrAny = {
'bits': attrs.pop('gptq_bits', 4),
'group_size': attrs.pop('gptq_group_size', -1),
'damp_percent': attrs.pop('gptq_damp_percent', 0.01),
'desc_act': attrs.pop('gptq_desc_act', True),
'sym': attrs.pop('gptq_sym', True),
'true_sequential': attrs.pop('gptq_true_sequential', True),
}
def create_gptq_config() -> transformers.GPTQConfig:
gptq_bits = attrs.pop('bits', 4)
gptq_tokenizer = attrs.pop('tokenizer', None)
gptq_dataset = attrs.pop('dataset', 'c4')
gptq_group_size = attrs.pop('group_size', 128)
gptq_damp_percent = attrs.pop('damp_percent', 0.1)
gptq_desc_act = attrs.pop('desc_act', False)
gptq_sym = attrs.pop('sym', True)
gptq_true_sequential = attrs.pop('true_sequential', True)
gptq_use_cuda_fp16 = attrs.pop('use_cuda_fp16', True if torch.cuda.is_available() else False)
gptq_model_seqlen = attrs.pop('model_seqlen', None)
gptq_block_name_to_quantize = attrs.pop('block_name_to_quantize', None)
gptq_module_name_preceding_first_block = attrs.pop('module_name_preceding_first_block', None)
gptq_batch_size = attrs.pop('batch_size', 1)
gptq_pad_token_id = attrs.pop('pad_token_id', None)
gptq_disable_exllama = attrs.pop('disable_exllama', False)
return transformers.GPTQConfig(bits=gptq_bits,
tokenizer=gptq_tokenizer,
dataset=gptq_dataset,
group_size=gptq_group_size,
damp_percent=gptq_damp_percent,
desc_act=gptq_desc_act,
sym=gptq_sym,
true_sequential=gptq_true_sequential,
use_cuda_fp16=gptq_use_cuda_fp16,
model_seqlen=gptq_model_seqlen,
block_name_to_quantize=gptq_block_name_to_quantize,
module_name_preceding_first_block=gptq_module_name_preceding_first_block,
batch_size=gptq_batch_size,
pad_token_id=gptq_pad_token_id,
disable_exllama=gptq_disable_exllama)
def create_int8_config(int8_skip_modules: list[str] | None) -> transformers.BitsAndBytesConfig:
if int8_skip_modules is None: int8_skip_modules = []
@@ -69,24 +90,18 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMo
raise RuntimeError("Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'")
if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules)
elif quantise == 'int4':
if is_transformers_supports_kbit():
quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
bnb_4bit_compute_dtype=int4_compute_dtype,
bnb_4bit_quant_type=int4_quant_type,
bnb_4bit_use_double_quant=int4_use_double_quant)
else:
logger.warning(
"'quantize' is set to int4, while the current transformers version %s does not support k-bit quantization. k-bit quantization is supported since transformers 4.30, therefore make sure to install the latest version of transformers either via PyPI or from git source: 'pip install git+https://github.com/huggingface/transformers'. Fallback to int8 quantisation.",
pkg.pkg_version_info('transformers'))
quantisation_config = create_int8_config(int8_skip_modules)
quantisation_config = transformers.BitsAndBytesConfig(load_in_4bit=True,
bnb_4bit_compute_dtype=int4_compute_dtype,
bnb_4bit_quant_type=int4_quant_type,
bnb_4bit_use_double_quant=int4_use_double_quant)
elif quantise == 'gptq':
if not is_autogptq_available():
if not is_autogptq_available() or not is_optimum_supports_gptq():
logger.warning(
"'quantize=\"gptq\"' requires 'auto-gptq' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
"'quantize=\"gptq\"' requires 'auto-gptq' and 'optimum>=0.12' to be installed (not available with local environment). Make sure to have 'auto-gptq' available locally: 'pip install \"openllm[gptq]\"'. OpenLLM will fallback to int8 with bitsandbytes."
)
quantisation_config = create_int8_config(int8_skip_modules)
else:
quantisation_config = autogptq.BaseQuantizeConfig(**autogptq_attrs)
quantisation_config = create_gptq_config()
else:
raise ValueError(f"'quantize' must be one of ['int8', 'int4', 'gptq'], got {quantise} instead.")
return quantisation_config, attrs
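For reference, a sketch of the upstream config `create_gptq_config` now produces for `quantise='gptq'`, with the defaults popped above spelled out (the tokenizer value is illustrative; it is normally forwarded from `from_pretrained`):

```python
import torch
import transformers

gptq_config = transformers.GPTQConfig(bits=4,
                                      tokenizer='facebook/opt-125m',  # illustrative; usually the model_id
                                      dataset='c4',
                                      group_size=128,
                                      damp_percent=0.1,
                                      desc_act=False,
                                      use_cuda_fp16=torch.cuda.is_available(),
                                      disable_exllama=not torch.cuda.is_available())  # exllama kernels require CUDA
```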

View File

@@ -109,12 +109,11 @@ def metadata_v1(_: str) -> openllm.MetadataOutput:
],
'num_tokens': 20
}))
async def embeddings_v1(phrases: list[str]) -> openllm.EmbeddingsOutput:
async def embeddings_v1(phrases: list[str]) -> list[openllm.EmbeddingsOutput]:
embed_call: _EmbeddingMethod = runner.embeddings if runner.supports_embeddings else generic_embedding_runner.encode # type: ignore[type-arg,assignment,valid-type]
responses = (await embed_call.async_run(phrases))[0]
return openllm.EmbeddingsOutput(embeddings=responses['embeddings'], num_tokens=responses['num_tokens'])
return await embed_call.async_run(phrases)
if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent():
if runner.supports_hf_agent:
async def hf_agent(request: Request) -> Response:
json_str = await request.body()
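Since the handler now returns the runner output verbatim as a list, a hedged client sketch of consuming the new shape (the `/v1/embeddings` route is an assumption based on OpenLLM's v1 endpoint naming):

```python
import requests

# The response is now a JSON list of EmbeddingsOutput objects rather than a single one.
resp = requests.post('http://localhost:3000/v1/embeddings', json=['Hello world', 'OpenLLM'])
resp.raise_for_status()
for item in resp.json():
  print(len(item['embeddings']), item['num_tokens'])
```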

View File

@@ -128,19 +128,19 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any],
quantize: LiteralString | None,
adapter_map: dict[str, str | None] | None,
dockerfile_template: str | None,
serialisation_format: t.Literal['safetensors', 'legacy'],
serialisation: t.Literal['safetensors', 'legacy'],
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
from openllm.cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy())
env: openllm_core.utils.EnvVarMixin = llm.config['env']
if env['backend_value'] == 'vllm': serialisation_format = 'legacy'
if env['backend_value'] == 'vllm': serialisation = 'legacy'
env_dict = {
env.backend: env['backend_value'],
env.config: f"'{llm.config.model_dump_json().decode()}'",
env.model_id: f'/home/bentoml/bento/models/{llm.tag.path()}',
'OPENLLM_MODEL': llm.config['model_name'],
'OPENLLM_SERIALIZATION': serialisation_format,
'OPENLLM_SERIALIZATION': serialisation,
'OPENLLM_ADAPTER_MAP': f"'{orjson.dumps(adapter_map).decode()}'",
'BENTOML_DEBUG': str(True),
'BENTOML_QUIET': str(False),
@@ -207,7 +207,7 @@ def create_bento(bento_tag: bentoml.Tag,
dockerfile_template: str | None,
adapter_map: dict[str, str | None] | None = None,
extra_dependencies: tuple[str, ...] | None = None,
serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
container_registry: LiteralContainerRegistry = 'ecr',
container_version_strategy: LiteralContainerVersionStrategy = 'release',
_bento_store: BentoStore = Provide[BentoMLContainer.bento_store],
@@ -246,7 +246,7 @@ def create_bento(bento_tag: bentoml.Tag,
quantize,
adapter_map,
dockerfile_template,
serialisation_format,
serialisation,
container_registry,
container_version_strategy))

View File

@@ -22,6 +22,7 @@ from bentoml._internal.configuration.containers import BentoMLContainer
from openllm_core._typing_compat import Concatenate
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import LiteralBackend
from openllm_core._typing_compat import LiteralQuantise
from openllm_core._typing_compat import LiteralString
from openllm_core._typing_compat import ParamSpec
from openllm_core._typing_compat import get_literal_args
@@ -131,15 +132,15 @@ Available official model_id(s): [default: {llm_config['default_id']}]
model_version: str | None,
workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString,
device: t.Tuple[str, ...],
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
quantize: LiteralQuantise | None,
backend: LiteralBackend,
serialisation_format: t.Literal['safetensors', 'legacy'],
serialisation: t.Literal['safetensors', 'legacy'],
cors: bool,
adapter_id: str | None,
return_process: bool,
**attrs: t.Any,
) -> LLMConfig | subprocess.Popen[bytes]:
if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
if serialisation == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
termui.echo(
f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.",
fg='yellow')
@@ -184,11 +185,11 @@ Available official model_id(s): [default: {llm_config['default_id']}]
'BENTOML_DEBUG': str(openllm.utils.get_debug_mode()),
'BENTOML_HOME': os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()),
'OPENLLM_ADAPTER_MAP': orjson.dumps(adapter_map).decode(),
'OPENLLM_SERIALIZATION': serialisation_format,
env.backend: env['backend_value']
'OPENLLM_SERIALIZATION': serialisation,
env.backend: env['backend_value'],
})
if env['model_id_value']: start_env[env.model_id] = str(env['model_id_value'])
if quantize is not None: start_env[env.quantize] = str(t.cast(str, env['quantize_value']))
if env['quantize_value']: start_env[env.quantize] = str(env['quantize_value'])
llm = openllm.utils.infer_auto_class(env['backend_value']).for_model(model,
model_id=start_env[env.model_id],
@@ -196,7 +197,8 @@ Available official model_id(s): [default: {llm_config['default_id']}]
llm_config=config,
ensure_available=True,
adapter_map=adapter_map,
serialisation=serialisation_format)
quantize=env['quantize_value'],
serialisation=serialisation)
start_env.update({env.config: llm.config.model_dump_json().decode()})
server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs)
@@ -262,8 +264,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab
- DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/)
- GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml)
''',
),
'''),
quantize_option(factory=cog.optgroup),
serialisation_option(factory=cog.optgroup),
cog.optgroup.option('--device',
@@ -457,7 +458,7 @@ def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool =
def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option('--serialisation',
'--serialization',
'serialisation_format',
'serialisation',
type=click.Choice(['safetensors', 'legacy']),
default='safetensors',
show_default=True,

View File

@@ -26,6 +26,7 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralBackend
from openllm_core._typing_compat import LiteralContainerRegistry
from openllm_core._typing_compat import LiteralContainerVersionStrategy
from openllm_core._typing_compat import LiteralQuantise
from openllm_core._typing_compat import LiteralString
logger = logging.getLogger(__name__)
@@ -37,7 +38,7 @@ def _start(model_name: str,
timeout: int = 30,
workers_per_resource: t.Literal['conserved', 'round_robin'] | float | None = None,
device: tuple[str, ...] | t.Literal['all'] | None = None,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
quantize: LiteralQuantise | None = None,
adapter_map: dict[LiteralString, str | None] | None = None,
backend: LiteralBackend | None = None,
additional_args: list[str] | None = None,
@@ -109,7 +110,7 @@ def _build(model_name: str,
model_id: str | None = None,
model_version: str | None = None,
bento_version: str | None = None,
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
quantize: LiteralQuantise | None = None,
adapter_map: dict[str, str | None] | None = None,
build_ctx: str | None = None,
enable_features: tuple[str, ...] | None = None,
@@ -120,7 +121,7 @@ def _build(model_name: str,
container_version_strategy: LiteralContainerVersionStrategy | None = None,
push: bool = False,
containerize: bool = False,
serialisation_format: t.Literal['safetensors', 'legacy'] = 'safetensors',
serialisation: t.Literal['safetensors', 'legacy'] = 'safetensors',
additional_args: list[str] | None = None,
bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> bentoml.Bento:
"""Package a LLM into a Bento.
@@ -160,14 +161,14 @@ def _build(model_name: str,
container_registry: Container registry to choose the base OpenLLM container image to build from. Default to ECR.
container_version_strategy: The container version strategy. Default to the latest release of OpenLLM.
serialisation_format: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
serialisation: Serialisation for saving models. Default to 'safetensors', which is equivalent to `safe_serialization=True`
additional_args: Additional arguments to pass to ``openllm build``.
bento_store: Optional BentoStore for saving this BentoLLM. Default to the default BentoML local store.
Returns:
``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format]
args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation]
if quantize: args.extend(['--quantize', quantize])
if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.")
if push: args.extend(['--push'])
@@ -203,8 +204,8 @@ def _import_model(model_name: str,
model_id: str | None = None,
model_version: str | None = None,
backend: LiteralBackend = 'pt',
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
serialisation_format: t.Literal['legacy', 'safetensors'] = 'safetensors',
quantize: LiteralQuantise | None = None,
serialisation: t.Literal['legacy', 'safetensors'] = 'safetensors',
additional_args: t.Sequence[str] | None = None) -> bentoml.Model:
"""Import a LLM into local store.
@@ -228,7 +229,7 @@ def _import_model(model_name: str,
- int8: Quantize the model with 8bit (bitsandbytes required)
- int4: Quantize the model with 4bit (bitsandbytes required)
- gptq: Quantize the model with GPTQ (auto-gptq required)
serialisation_format: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
serialisation: Type of model format to save to local store. If set to 'safetensors', then OpenLLM will save model using safetensors.
Default behaviour is similar to ``safe_serialization=False``.
additional_args: Additional arguments to pass to ``openllm import``.
@@ -236,7 +237,7 @@ def _import_model(model_name: str,
``bentoml.Model``:BentoModel of the given LLM. This can be used to serve the LLM or can be pushed to BentoCloud.
"""
from .entrypoint import import_command
args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation_format]
args = [model_name, '--backend', backend, '--machine', '--serialisation', serialisation]
if model_id is not None: args.append(model_id)
if model_version is not None: args.extend(['--model-version', str(model_version)])
if additional_args is not None: args.extend(additional_args)
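A hedged usage sketch of the renamed Python API (the keyword is now `serialisation`; the model id is illustrative):

```python
import openllm

# Import a model into the local store with the renamed keyword.
bentomodel = openllm.import_model('opt', model_id='facebook/opt-125m', quantize='gptq', serialisation='safetensors')

# Package it into a Bento; build() shells out to 'openllm build --serialisation ...'.
bento = openllm.build('opt', model_id='facebook/opt-125m', quantize='gptq', serialisation='safetensors')
```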

View File

@@ -54,7 +54,6 @@ import openllm
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelStore
from openllm import bundle
from openllm import serialisation
from openllm.exceptions import OpenLLMException
from openllm.models.auto import CONFIG_MAPPING
from openllm.models.auto import MODEL_FLAX_MAPPING_NAMES
@@ -67,6 +66,7 @@ from openllm.utils import infer_auto_class
from openllm_core._typing_compat import Concatenate
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import LiteralBackend
from openllm_core._typing_compat import LiteralQuantise
from openllm_core._typing_compat import LiteralString
from openllm_core._typing_compat import ParamSpec
from openllm_core._typing_compat import Self
@@ -84,7 +84,6 @@ from openllm_core.utils import first_not_none
from openllm_core.utils import get_debug_mode
from openllm_core.utils import get_quiet_mode
from openllm_core.utils import is_torch_available
from openllm_core.utils import is_transformers_supports_agent
from openllm_core.utils import resolve_user_filepath
from openllm_core.utils import set_debug_mode
from openllm_core.utils import set_quiet_mode
@@ -343,8 +342,8 @@ def import_command(
output: LiteralOutput,
machine: bool,
backend: LiteralBackend,
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
serialisation_format: t.Literal['safetensors', 'legacy'],
quantize: LiteralQuantise | None,
serialisation: t.Literal['safetensors', 'legacy'],
) -> bentoml.Model:
"""Setup LLM interactively.
@@ -369,7 +368,7 @@ def import_command(
\b
```bash
$ openllm download opt facebook/opt-2.7b
$ openllm import opt facebook/opt-2.7b
```
\b
@@ -400,17 +399,19 @@ def import_command(
env = EnvVarMixin(model_name, backend=llm_config.default_backend(), model_id=model_id, quantize=quantize)
backend = first_not_none(backend, default=env['backend_value'])
llm = infer_auto_class(backend).for_model(
model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False, serialisation=serialisation_format
model_name, model_id=env['model_id_value'], llm_config=llm_config, model_version=model_version, ensure_available=False,
quantize=env['quantize_value'],
serialisation=serialisation
)
_previously_saved = False
try:
_ref = serialisation.get(llm)
_ref = openllm.serialisation.get(llm)
_previously_saved = True
except openllm.exceptions.OpenLLMException:
if not machine and output == 'pretty':
msg = f"'{model_name}' {'with model_id='+ model_id if model_id is not None else ''} does not exists in local store for backend {llm.__llm_backend__}. Saving to BENTOML_HOME{' (path=' + os.environ.get('BENTOML_HOME', BentoMLContainer.bentoml_home.get()) + ')' if get_debug_mode() else ''}..."
termui.echo(msg, fg='yellow', nl=True)
_ref = serialisation.get(llm, auto_import=True)
_ref = openllm.serialisation.get(llm, auto_import=True)
if backend == 'pt' and is_torch_available() and torch.cuda.is_available(): torch.cuda.empty_cache()
if machine: return _ref
elif output == 'pretty':
@@ -472,7 +473,7 @@ def build_command(
bento_version: str | None,
overwrite: bool,
output: LiteralOutput,
quantize: t.Literal['int8', 'int4', 'gptq'] | None,
quantize: LiteralQuantise | None,
enable_features: tuple[str, ...] | None,
workers_per_resource: float | None,
adapter_id: tuple[str, ...],
@@ -483,7 +484,7 @@ def build_command(
dockerfile_template: t.TextIO | None,
containerize: bool,
push: bool,
serialisation_format: t.Literal['safetensors', 'legacy'],
serialisation: t.Literal['safetensors', 'legacy'],
container_registry: LiteralContainerRegistry,
container_version_strategy: LiteralContainerVersionStrategy,
force_push: bool,
@@ -517,12 +518,12 @@ def build_command(
# NOTE: We set this environment variable so that our service.py logic won't raise RuntimeError
# during build. This is a current limitation of bentoml build where we actually import the service.py into sys.path
try:
os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation_format, 'OPENLLM_BACKEND': env['backend_value']})
os.environ.update({'OPENLLM_MODEL': inflection.underscore(model_name), 'OPENLLM_SERIALIZATION': serialisation, env.backend: env['backend_value']})
if env['model_id_value']: os.environ[env.model_id] = str(env['model_id_value'])
if env['quantize_value']: os.environ[env.quantize] = str(env['quantize_value'])
llm = infer_auto_class(env['backend_value']).for_model(
model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, serialisation=serialisation_format, **attrs
model_name, model_id=env['model_id_value'], llm_config=llm_config, ensure_available=True, model_version=model_version, quantize=env['quantize_value'], serialisation=serialisation, **attrs
)
labels = dict(llm.identifying_params)
@@ -798,7 +799,6 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
except http.client.BadStatusLine:
raise click.ClickException(f'{endpoint} is neither a HTTP server nor reachable.') from None
if agent == 'hf':
if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. Upgrade with 'pip install -U transformers'")
_memoized = {k: v[0] for k, v in _memoized.items() if v}
client._hf_agent.set_stream(logger.info)
if output != 'porcelain': termui.echo(f"Sending the following prompt ('{task}') with the following vars: {_memoized}", fg='magenta')

View File

@@ -49,7 +49,7 @@ class BaseAutoLLMClass:
```
'''
llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs)
if ensure_available: llm.ensure_model_id_exists()
if ensure_available: llm.save_pretrained()
return llm
@classmethod

View File

@@ -37,6 +37,7 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T:
from .transformers._helpers import process_config
config, *_ = process_config(llm._bentomodel.path, llm.trust_remote_code)
bentomodel_fs = fs.open_fs(llm._bentomodel.path)
if bentomodel_fs.isfile(CUSTOM_OBJECTS_FILENAME):
with bentomodel_fs.open(CUSTOM_OBJECTS_FILENAME, 'rb') as cofile:

View File

@@ -14,13 +14,14 @@ import openllm
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelOptions
from openllm_core._typing_compat import M
from openllm_core._typing_compat import T
from ._helpers import check_unintialised_params
from ._helpers import infer_autoclass_from_llm
from ._helpers import infer_tokenizers_from_llm
from ._helpers import make_model_signatures
from ._helpers import process_config
from ._helpers import update_model
from .weights import HfIgnore
if t.TYPE_CHECKING:
@@ -32,8 +33,6 @@ if t.TYPE_CHECKING:
from bentoml._internal.models import ModelStore
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import M
from openllm_core._typing_compat import T
else:
autogptq = openllm.utils.LazyLoader('autogptq', globals(), 'auto_gptq')
torch = openllm.utils.LazyLoader('torch', globals(), 'torch')
@@ -63,16 +62,23 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
"""
config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
_, tokenizer_attrs = llm.llm_parameters
quantize_method = llm._quantize_method
safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors')
quantize = llm._quantize
safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation == 'safetensors')
# Disable safe serialization with vLLM
if llm.__llm_backend__ == 'vllm': safe_serialisation = False
metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method}
metadata: DictStrAny = {'safe_serialisation': safe_serialisation}
if quantize: metadata['_quantize'] = quantize
architectures = getattr(config, 'architectures', [])
if not architectures: raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
metadata['_pretrained_class'] = architectures[0]
signatures: DictStrAny = {}
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if quantize == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
signatures['generate'] = {'batchable': False}
@@ -82,7 +88,8 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
if 'quantization_config' in attrs and getattr(attrs['quantization_config'], 'load_in_4bit', False):
attrs.pop('quantization_config')
if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation
metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__
metadata['_framework'] = llm.__llm_backend__
signatures.update(make_model_signatures(llm))
tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs)
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
@@ -95,42 +102,22 @@ def import_model(llm: openllm.LLM[M, T], *decls: t.Any, trust_remote_code: bool,
options=ModelOptions(),
context=openllm.utils.generate_context(framework_name='openllm'),
labels=openllm.utils.generate_labels(llm),
signatures=signatures if signatures else make_model_signatures(llm))
metadata=metadata,
signatures=signatures)
with openllm.utils.analytics.set_bentoml_tracking():
try:
bentomodel.enter_cloudpickle_context(external_modules, imported_modules)
tokenizer.save_pretrained(bentomodel.path)
if quantize_method == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
logger.debug('Saving model with GPTQ quantisation will require loading model into memory.')
model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id,
*decls,
quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
trust_remote_code=trust_remote_code,
use_safetensors=safe_serialisation,
**hub_attrs,
**attrs)
update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework})
model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation)
if llm._local:
# possible local path
logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
# for trust_remote_code to work
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
else:
architectures = getattr(config, 'architectures', [])
if not architectures:
raise RuntimeError('Failed to determine the architecture for this model. Make sure the `config.json` is valid and can be loaded with `transformers.AutoConfig`')
architecture = architectures[0]
update_model(bentomodel, metadata={'_pretrained_class': architecture})
if llm._local:
# possible local path
logger.debug('Model will be loaded into memory to save to target store as it is from local path.')
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm.model_id, *decls, config=config, trust_remote_code=trust_remote_code, **hub_attrs, **attrs)
# for trust_remote_code to work
bentomodel.enter_cloudpickle_context([importlib.import_module(model.__module__)], imported_modules)
model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation)
else:
# we will clone all things into the bentomodel path without loading the model into memory
snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
# we will clone all things into the bentomodel path without loading the model into memory
snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm))
except Exception:
raise
else:
@@ -165,29 +152,27 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model:
def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M:
config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs)
safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)),
attrs.pop('safe_serialization', None),
default=llm._serialisation_format == 'safetensors')
if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
if not openllm.utils.is_autogptq_available():
raise openllm.exceptions.OpenLLMException("GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'")
if llm.config['model_type'] != 'causal_lm':
raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})")
return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path,
*decls,
quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config),
trust_remote_code=llm.trust_remote_code,
use_safetensors=safe_serialization,
**hub_attrs,
**attrs)
auto_class = infer_autoclass_from_llm(llm, config)
device_map: str | None = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
device_map = attrs.pop('device_map', 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None)
model = infer_autoclass_from_llm(llm, config).from_pretrained(llm._bentomodel.path,
*decls,
config=config,
trust_remote_code=llm.trust_remote_code,
device_map=device_map,
**hub_attrs,
**attrs).eval()
if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq':
if not openllm.utils.is_autogptq_available() or not openllm.utils.is_optimum_supports_gptq():
raise openllm.exceptions.OpenLLMException(
"GPTQ quantisation requires 'auto-gptq' and 'optimum' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/'"
)
if llm.config['model_type'] != 'causal_lm': raise openllm.exceptions.OpenLLMException(f"GPTQ only supports causal LM (got {llm.__class__} of {llm.config['model_type']})")
model = auto_class.from_pretrained(llm._bentomodel.path, device_map='auto', **hub_attrs, **attrs)
# TODO: Use the below logic once TheBloke finishes migrating to the new GPTQConfig from transformers
# from accelerate import init_empty_weights
# from optimum.gptq import load_quantized_model
# # disable exllama if gptq is loaded on CPU
# disable_exllama = not torch.cuda.is_available()
# with init_empty_weights():
# empty = auto_class.from_pretrained(llm.model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map='auto')
# empty.tie_weights()
# model = load_quantized_model(empty, save_folder=llm._bentomodel.path, device_map='auto', disable_exllama=disable_exllama)
else:
model = auto_class.from_pretrained(llm._bentomodel.path, *decls, config=config, trust_remote_code=llm.trust_remote_code, device_map=device_map, **hub_attrs, **attrs).eval()
if llm.__llm_backend__ in {'pt', 'vllm'}: check_unintialised_params(model)
return t.cast('M', model)
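The GPTQ branch above leans on transformers' upstream integration: a quantised checkpoint embeds its `GPTQConfig` in `config.json`, so loading reduces to a plain `from_pretrained`. A minimal sketch under that assumption (checkpoint name illustrative; requires `auto-gptq` and `optimum>=1.12.0` installed):

```python
import transformers

# transformers reads the embedded quantization_config and dispatches to the
# GPTQ kernels; no explicit auto_gptq.AutoGPTQForCausalLM call is needed.
model = transformers.AutoModelForCausalLM.from_pretrained('TheBloke/Llama-2-7B-GPTQ', device_map='auto')
```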

View File

@@ -5,7 +5,6 @@ import typing as t
import openllm
import openllm_core
from bentoml._internal.models.model import ModelInfo
from bentoml._internal.models.model import ModelSignature
from openllm.serialisation.constants import FRAMEWORK_TO_AUTOCLASS_MAPPING
from openllm.serialisation.constants import HUB_ATTRS
@@ -16,8 +15,6 @@ if t.TYPE_CHECKING:
from transformers.models.auto.auto_factory import _BaseAutoModelClass
import bentoml
from bentoml._internal.models.model import ModelSignaturesType
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import M
@@ -25,8 +22,6 @@ if t.TYPE_CHECKING:
else:
transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch')
_object_setattr = object.__setattr__
def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]:
'''A helper function that correctly parse config and attributes for transformers.PretrainedConfig.
@@ -73,24 +68,6 @@ def check_unintialised_params(model: torch.nn.Module) -> None:
if len(unintialized) > 0:
raise RuntimeError(f'Found the following unintialized parameters in {model}: {unintialized}')
def update_model(bentomodel: bentoml.Model, metadata: DictStrAny) -> bentoml.Model:
based: DictStrAny = copy.deepcopy(bentomodel.info.metadata)
based.update(metadata)
_object_setattr(
bentomodel,
'_info',
ModelInfo( # type: ignore[call-arg] # XXX: remove me once upstream is merged
tag=bentomodel.info.tag,
module=bentomodel.info.module,
labels=bentomodel.info.labels,
options=bentomodel.info.options.to_dict(),
signatures=bentomodel.info.signatures,
context=bentomodel.info.context,
api_version=bentomodel.info.api_version,
creation_time=bentomodel.info.creation_time,
metadata=based))
return bentomodel
# NOTE: sync with bentoml/_internal/frameworks/transformers.py#make_default_signatures
def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType:
infer_fn: tuple[str, ...] = ('__call__',)

View File

@@ -11,11 +11,12 @@ import openllm
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralBackend
from openllm_core._typing_compat import LiteralQuantise
logger = logging.getLogger(__name__)
@contextlib.contextmanager
def build_bento(model: str, model_id: str | None = None, quantize: t.Literal['int4', 'int8', 'gptq'] | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
def build_bento(model: str, model_id: str | None = None, quantize: LiteralQuantise | None = None, cleanup: bool = False) -> t.Iterator[bentoml.Bento]:
logger.info('Building BentoML for %s', model)
bento = openllm.build(model, model_id=model_id, quantize=quantize)
yield bento

View File

@@ -19,13 +19,7 @@ if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralBackend
def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]:
return {
'backend': llm.__llm_backend__,
'framework': 'openllm',
'model_name': llm.config['model_name'],
'architecture': llm.config['architecture'],
'serialisation_format': llm._serialisation_format
}
return {'backend': llm.__llm_backend__, 'framework': 'openllm', 'model_name': llm.config['model_name'], 'architecture': llm.config['architecture'], 'serialisation': llm._serialisation}
def infer_auto_class(backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]:
import openllm

View File

@@ -24,6 +24,7 @@ import openllm
from openllm._llm import normalise_model_name
from openllm_core._typing_compat import DictStrAny
from openllm_core._typing_compat import ListAny
from openllm_core._typing_compat import LiteralQuantise
logger = logging.getLogger(__name__)
@@ -141,14 +142,7 @@ class DockerHandle(_Handle):
return container.status in ['running', 'created']
@contextlib.contextmanager
def _local_handle(model: str,
model_id: str,
image_tag: str,
deployment_mode: t.Literal['container', 'local'],
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
*,
_serve_grpc: bool = False,
):
def _local_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
with openllm.utils.reserve_free_port() as port:
pass
@@ -169,14 +163,7 @@ def _local_handle(model: str,
proc.stderr.close()
@contextlib.contextmanager
def _container_handle(model: str,
model_id: str,
image_tag: str,
deployment_mode: t.Literal['container', 'local'],
quantize: t.Literal['int8', 'int4', 'gptq'] | None = None,
*,
_serve_grpc: bool = False,
):
def _container_handle(model: str, model_id: str, image_tag: str, deployment_mode: t.Literal['container', 'local'], quantize: LiteralQuantise | None = None, *, _serve_grpc: bool = False):
envvar = openllm.utils.EnvVarMixin(model)
with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port: